* ------------------------------------------------------------------------------
* Start from a clean session (data, labels, scalars, programs, ...)
* ------------------------------------------------------------------------------
clear all

* ------------------------------------------------------------------------------
* Globals specific to this do-file
*   filename : stem used for the name of the log-file
*   date     : today's date as a string in %tdCYND format (e.g. 20230501)
* ------------------------------------------------------------------------------
global filename 1prepare
global date = string(d(`c(current_date)'), "%tdCYND")

* ------------------------------------------------------------------------------
* Session settings
* ------------------------------------------------------------------------------
set more off
set linesize 200
set rmsg on
set maxvar 32767, permanently
set excelxlsxlargefile on

* ------------------------------------------------------------------------------
* Open the log-file, but only if the switch ${log_active} equals 1.
* ${log_active} and ${log} are not set in this file; they are expected to be
* defined before this do-file is run.
* ------------------------------------------------------------------------------
if ${log_active}==1 {
	macro list date
	capture log close
	log using ${log}/${filename}_${date}.log, replace text
}

********************************************************************************
*** 1prepare.do ****************************************************************
*** Data preparation************************************************************
*** last change: 2023/05/01 (LH)************************************************
********************************************************************************

* Contents of this file:	 							
* (1) Load IEB & Rename variables from IEB- to SIAB-parlance
* (2) Identify mining areas
* (3) Identify lignite spells and "some lignite" spells (mining AND services)
*		SPELLLIG & SOMELIG
* (4) Identify lignite workers (mining AND services)
* 		LIFELIG	
* (5) Define Tentgeltdays (how many days worked per month)
* (6) Define Employment categories	/ status variable	 	
* (7) Sociodemographic and other individual characteristics
	* (7.a) Age
	* (7.b) Gender
	* (7.c) Education
	* (7.d) Occupations
	* (7.e) Industry sectors
* (8) Parallel spells  
	* (8a) Assigning observations to 5 mutually exclusive employment hours categories (UE, NE, ME, VOC, ALMP)
	* (8b) Identifying parallel spells of extra payment and adding to the wage of non-extra payment spell
	* (8c) Eliminate parallel spells
	* (8d) Imputing social benefits
	* (8e) Deflating incomes & Imputing above assessment ceiling
* (9) Collecting information on whole labour market biographies
*     (pre dropping of Black holes)      
	* (9a) Income variable from all sources
	* (9b) Specific lignite sector experience 
	* (9c) Capture date of potential retirement (endepi of last spell)
	* (9d) Capture date of potential early retirement for ATZ cases
* (10) Black holes                                      
* (11) Collating consecutive spells 				
/* (12) Generate datasets for estimation                                    */
	* (12.a) Drop individuals with too long or too many black holes
	*		(All the following datasets build on dataset created in 11a.)
	* 		-> postprep_1
	* (12.b) Sample that contains all remaining information for each individual,
	* 		i.e. black holes are an additional spell assigned the status 10 - 'black hole'.
	*		-> postprep_2
	* (12.c) Select only one series per person (no interruptions with long black holes):
	*		1) only lignite series, 2) longest series if multiple lignite series
	*		-> postprep_3
	* (12.d) Select only one series per person (no interruptions with long black holes): 
	*		1) only lignite series, 2) series with at least one transition (into or out of lignite)
	*		3) longest series if multiple lignite transition series
	*		-> postprep_4
			
/* ---------------------------------------------------------------------------*/
/* (1) Load IEB & Rename variables from IEB- to SIAB-parlance				  */
/* -------------------------------------------------------------------------- */

* Real IEB extract (iab==1): load it and rename the IEB variables to their
* SIAB-style names. The two lists of the group rename are laid out in matching
* rows: the k-th name of a row in the first list maps to the k-th name of the
* same row in the second list.
if ${iab}==1 {
	use ${orig}/IEB/a014526_ieb_epi.dta, clear

	rename ///
		(ieb_quellverf_id    ieb_beg_epi       ieb_end_epi  ieb_beg_orig  ieb_end_orig       ///
		 ieb_staat_num       ieb_dba_id        isb_id       ieb_tag_entg  ieb_beruf_kons_num ///
		 ieb_erw_stat_num    gleitzone_num     ieb_abg_num  ieb_zug_gr    ieb_sna_id         ///
		 ieb_rest_anspruch   ieb_sgb_II_tr_art ieb_beg_alo  ieb_dau_alo                      ///
		 ieb_wo_krs_num      ieb_wo_aa_num     wz03_num     wz08_kons_num                    ///
		 berufstellg_imp_num ieb_ao_gem_num    prs_id)                                       ///
		(quelle              begepi            endepi       begorig       endorig            ///
		 nation              ausbildung        schule       tentgelt      beruf              ///
		 erwstat             gleitz            grund        estatvor      estatnach          ///
		 restanspruch        traeger           alo_beg      alo_dau                          ///
		 wo_kreis            wo_aa             w93_3        wz08                             ///
		 stib                ao_gem            persnr)
}

* Test data (iab==0): the file is used as is, i.e. it must already carry the
* SIAB-style variable names.
if ${iab}==0 {
	use pre1prep.dta, clear
}

* pid is a plain copy of the person identifier persnr
capture drop pid
generate pid = persnr

/* ------------------------------------------------------------------------ */
/* (2) Identify mining areas												*/
/* ------------------------------------------------------------------------ */

	rename ieb_ao_krs_num ao_kreis

	* Regional classification of the German lignite mining areas, based on the
	* district (Kreis) code in ao_kreis. The code is compared numerically, so the
	* leading zero of the official five-digit keys is irrelevant.
	*
	* Some districts are included because lignite services are covered as well,
	* although they host(ed) only power plants and no mine:
	*   - Kreisfreie Stadt Cottbus (belongs to the Lausitzer Revier)
	*   - Kreisfreie Stadt Chemnitz (belongs to the Mitteldeutsches Revier)
	*   - Kreisfreie Stadt Köln (belongs to the Rheinisches Revier)
	* NOTE(review): no district labelled Chemnitz appears in the lists below - confirm.
	generate mining_area = .

	* Mining Area 1: Lausitzer Revier
	*   12052 Stadt Cottbus, 12061 LK Dahme-Spreewald, 12062 LK Elbe-Elster,
	*   12066 LK Oberspreewald-Lausitz, 12071 LK Spree-Neisse (all Brandenburg),
	*   14625 LK Bautzen, 14626 LK Goerlitz (both Sachsen)
	replace mining_area = 1 if inlist(ao_kreis, 12052, 12061, 12062, 12066, 12071, 14625, 14626)

	* Mining Area 2: Mitteldeutsches Revier
	*   14713 Stadt Leipzig, 14729 LK Leipzig, 14730 LK Nordsachsen (Sachsen),
	*   15002 Stadt Halle(Saale), 15082 LK Anhalt-Bitterfeld, 15084 LK Burgenlandkreis,
	*   15087 LK Mansfeld-Südharz, 15088 LK Saalekreis, 15091 LK Wittenberg (Sachsen-Anhalt),
	*   16077 LK Altenburger Land, 16075 LK Saale-Orla-Kreis (Thüringen)
	replace mining_area = 2 if inlist(ao_kreis, 14713, 14729, 14730, 15002, 15082, 15084, ///
												15087, 15088, 15091, 16077, 16075)

	* Mining Area 3: Helmstedter Revier
	*   03154 LK Helmstedt (Niedersachsen), 15083 LK Börde (Sachsen-Anhalt)
	replace mining_area = 3 if inlist(ao_kreis, 03154, 15083)

	* Mining Area 4: Rheinisches Revier (all NRW)
	*   05112 Stadt Duisburg, 05162 LK Rhein-Kreis Neuss, 05315 Stadt Köln,
	*   05334 LK Städteregion Aachen, 05362 LK Rhein-Erft-Kreis, 05358 LK Düren,
	*   05366 LK Euskirchen
	replace mining_area = 4 if inlist(ao_kreis, 05112, 05162, 05315, 05334, 05362, 05358, 05366)

	* Mining Area 5: Other undefined Reviere
	*   03153 LK Goslar, 03241 Region Hannover, 05570 LK Warendorf,
	*   06440 LK Wetteraukreis, 06611 Stadt Kassel, 06634 LK Schwalm-Eder-Kreis,
	*   06636 LK Werra-Meißner-Kreis, 09162 Stadt München, 09376 LK Schwandorf,
	*   09672 LK Bad Kissingen, 10041 Regionalverband Saarbrücken, 10044 LK Saarlouis,
	*   11000 Stadt Berlin, 13073 LK Vorpommern-Rügen, 14521 LK Erzgebirgskreis
	replace mining_area = 5 if inlist(ao_kreis, 03153, 03241, 05570, 06440, 06611, 06634, 06636, ///
												09162, 09376, 09672, 10041, 10044, 11000, 13073, 14521)

label define areas 1 "Lausitzer Revier" 2 "Mitteldt. Revier" 3 "Helmstedter Revier" 4 "Rheinisches Revier" 5 "Other Reviere"
label values mining_area areas

* Test data only: overwrite the area of every observation with a random draw
if ${iab}==0 {
	replace mining_area = ceil(5 * uniform())
}
 
/* ------------------------------------------------------------------------ */
/* (3) Identify lignite spells (mining AND services)						*/
/* ------------------------------------------------------------------------ */

	* The industry variable and the two relevant industry codes depend on the
	* data source. They are stored in locals so that the classification below is
	* written only once and is identical for real data and test data.
	if ${iab}==1 {
		* real IEB data: wz08 with 5200 = lignite mining, 9900 = mining services
		local lig_ind      wz08
		local lig_minecode 5200
		local lig_servcode 9900
	}
	if ${iab}==0 {
		* test data: w93_3 with 102 = economic sector that includes the lignite industry;
		* 211 is used in place of mining services
		* NOTE(review): the meaning of code 211 is not documented here - confirm.
		local lig_ind      w93_3
		local lig_minecode 102
		local lig_servcode 211
	}

	* SPELLLIGMINE: 1st criterion for lignite spell: lignite mining
			cap drop thisspellligmine
			gen thisspellligmine = (`lig_ind' == `lig_minecode')
			tab thisspellligmine, m

	* SPELLLIGSERV: 2nd criterion for lignite spells: lignite services
	* (i.e. mining services located in one of the mining areas 1-5)
			cap drop thisspellligserv
			gen thisspellligserv = (`lig_ind' == `lig_servcode' & (mining_area>0 & mining_area<6))
			tab thisspellligserv, m

	* SPELLLIGNITE: All lignite spells - either spell in lignite mining or lignite services
			cap drop thisspelllignite
			gen thisspelllignite = (thisspellligmine == 1 | thisspellligserv == 1)
			label var thisspelllignite "this spell is lignite industry (mining OR (services in lignite areas))"
			tab thisspelllignite, m

	* THISSPELLLIG_CAT: Categorical variable for lignite mining and lignite service spells
	*   2 = lignite mining, 1 = mining services in a mining area,
	*   0 = any other spell with a positive, non-missing industry code,
	*   . = industry code missing, zero or negative.
	* Extended missing codes (.a, .b, ...) are treated as missing as well.
	* Value labels are attached for both data sources.
			cap drop thisspelllig_cat
			gen thisspelllig_cat = .
			replace thisspelllig_cat = 0 if `lig_ind' > 0 & !missing(`lig_ind')
			replace thisspelllig_cat = 1 if thisspellligserv == 1
			replace thisspelllig_cat = 2 if thisspellligmine == 1
			label var thisspelllig_cat "mining services (1); lignite mining (2); other (0); missing (.)"
			label define thisspelllig_cat 0 "other" 1 "mining services" 2 "lignite mining", modify
			label val thisspelllig_cat thisspelllig_cat
			tab thisspelllig_cat, m

	* Check for reasons for deregistration
	tab grund if thisspellligmine, m // lignite mining
	tab grund if thisspellligserv, m // lignite services

* 	  SOMELIGNITE: Some spell of an individual (this or a parallel one) is in lignite
*     comprises SOMELIGMINE (mining) AND SOMELIGSERV (services)
*     Spells with the same persnr and begepi are treated as one episode.
			cap drop someligmine
			egen someligmine = max(thisspellligmine), by(persnr begepi)
			label var someligmine "during this episode, some spell was in lignite mining"

			cap drop someligserv
			egen someligserv = max(thisspellligserv), by(persnr begepi)
			label var someligserv "during this episode, some spell was in mining service in lignite area"
			tab someligserv, m

			* both inputs are 0/1 and never missing, so the result is 0/1 as well
			cap drop somelignite
			gen somelignite = (someligmine==1 | someligserv==1)
			label var somelignite "during this episode, some spell was in lignite industry"
		 
/* ------------------------------------------------------------------------ */
/* (4) Identify lignite workers - LIFELIGNITE 								*/
/*     covering LIFELIGMINE (mining) AND LIFELIGSERV (services)				*/
/* ------------------------------------------------------------------------ */

* lifeligmine / lifeligserv: 1 for every spell of a person who has at least one
* lignite-mining / lignite-service spell anywhere in the data, 0 otherwise.
cap drop lifeligmine
bysort persnr: egen lifeligmine = max(thisspellligmine)
tab lifeligmine, m

cap drop lifeligserv
bysort persnr: egen lifeligserv = max(thisspellligserv)
tab lifeligserv, m

* lifelignite: person has any lignite spell (mining or services)
cap drop lifelignite
gen lifelignite = (lifeligmine == 1 | lifeligserv == 1)
tab lifelignite, m

 * KEEP ONLY WORKERS WITH LIGNITE IN THEIR BIOGRAPHY & SAVE AS IEB_LIGNITE.DTA

keep if lifelignite == 1
count

* Forward slashes are used in the path: they work on every operating system
* and match the other file paths in this do-file.
save data/ieb_lignite.dta, replace

*** to save time, start analyses from here if no change in prior sample generation:
use data/ieb_lignite.dta, clear

/* ------------------------------------------------------------------------ */
 *  (5) DEFINE TENTGELTDAYS
/* ------------------------------------------------------------------------ */

			* Number of days by which the daily wage (tentgelt) is multiplied to
			* obtain monthly earnings.

			* working days per month; bank holidays vary across counties, so an
			* average is used
			scalar workdays=20
			* calendar days per month; the average is 30.42 in a normal year and
			* 30.50 in a leap year
			scalar calenderdays=30.45

			* See IAB documentation: LEH daily wages refer to different monthly
			* earnings pre and post-1998.
			* Spells with quelle==2 ending before 1998 use working days; all other
			* spells (quelle==2 from 1998 on, any other source, missing end date)
			* use calendar days.
			* NOTE(review): quelle==2 presumably identifies the LEH source - confirm.
			* The scalars are referenced through scalar() so that a variable with
			* the same name (or a matching abbreviation) cannot take precedence.
			gen tentgeltdays = cond(quelle==2 & year(endepi)<1998, scalar(workdays), scalar(calenderdays))

/* ------------------------------------------------------------------------ */
/* (6) Define Employment categories	/ status variable	 													*/	
/* 		Assigning observations to 5 mutually exclusive employment hours categories (UE, NE, ME, VOC, ALMP) */
/* ------------------------------------------------------------------------ */

	* Defining normal employment (NE)
	gen normalemp = inlist(erwstat, 101, 103, 118, 140, 203, 205, 599)

	/* Note: Define full-time first since part-time underreported (2011 reform in reporting)*/
	* Defining marginal employment (ME)

		/* Further information: According to the documentation, any observation with erwstat
		of 109 or 209 is directly registered as marginally employed. Furthermore, any observation
		which is registered via "Haushaltsscheck" (erwstat = 201) is defined as marginally employed,
		because that is the only way to be registered like that. 	*/

	gen margemp = inlist(erwstat, 109, 209, 201)

	* Earnings-based corrections. Monthly earnings are tentgelt*tentgeltdays; the
	* yearly thresholds come from the globals margempme_1975 ... margempme_2017,
	* which are not set in this file and must be defined before it is run.

		* (i) normalemp spells with very low earnings (below 75% of the threshold)
		*     are redefined as marginal employment. Both conditions test normalemp,
		*     so margemp has to be changed first.
	forvalues i=1975/2017{
	replace margemp = 1   if (normalemp==1 & year(endepi)==`i' & (tentgelt*tentgeltdays) < 0.75*${margempme_`i'})
	replace normalemp = 0 if (normalemp==1 & year(endepi)==`i' & (tentgelt*tentgeltdays) < 0.75*${margempme_`i'})
	}

		* (ii) margemp spells with high earnings (above 125% of the threshold) are
		*      redefined as normal employment. Both conditions test margemp, so
		*      normalemp must be set BEFORE margemp is reset; otherwise the second
		*      replace would find no observation and the spell would end up in
		*      neither category. Missing earnings compare as larger than any number
		*      in Stata, hence the explicit !missing() guard.
	forvalues i=1975/2017{
	replace normalemp = 1 if (margemp==1 & year(endepi)==`i' & !missing(tentgelt) & (tentgelt*tentgeltdays) > 1.25*${margempme_`i'})
	replace margemp = 0   if (margemp==1 & year(endepi)==`i' & !missing(tentgelt) & (tentgelt*tentgeltdays) > 1.25*${margempme_`i'})
	}

	*DelmeMB additional checking : does it have an impact on erwstat==103 ?
	disp("Check of erwstat after redifining normalemp spells with very low earnings as marginal emp spells")
	tab erwstat margemp
	
	* ----- Vocational training (VOC) -----
	gen vocational = inlist(erwstat, 102, 105, 106, 121, 122, 123, 141, 144)

	* ----- Unemployment (UE): ALG I vs. ALG II -----

	* ALG I
	gen unemp = inlist(erwstat, 31, 32, 61, 71, 91, 62, 72, 92)

	* We have information on ALG II - but do we have earnings of ALG II?
	count if quelle==4

	* ALG II
	if ${iab}==0 {
		display "note that erwstat numbers are not the same in testdata and actual data"
	}
	gen unemp2 = inlist(erwstat, 1, 2)
	label var unemp2 "unemp benefits (ALG II) spell - often parallel to unemp / can be additional to emp"

	* ----- Unemployment benefit spells (also UE) -----

		* ALG I (from LEH)
		gen unempbenefit1 = (erwstat==11)

		* Check: erwstat 1 and 2 should all come from LHG
		tab quelle if inlist(erwstat, 1, 2)
		* Check: ALHI (erwstat 12) should come from LEH
		tab quelle if erwstat==12

		* ALHI and ALG II
		gen unempbenefit2 = inlist(erwstat, 12, 1, 2)

	* Note: all benefit and NALO spells are removed later, when deparallelizing

	* ----- Active labor market policies (ALMP) -----
	count if quelle==32
	count if erwstat>1000

	* erwstat above 1000; the system missing value is excluded explicitly
	gen almp = (erwstat>1000 & erwstat!=.)

	* Unterhaltsgeld (UHG) (from LEH)
	gen almpbenefit = (erwstat==13)

	* ----- "Not unemployed but searching" (NALO) -----
	gen searchemp = inlist(erwstat, 33, 34, 35, 63, 64, 73, 74, 75, 93, 94, 95)
	label var searchemp "not unemployed but searching for a job"

	* Note: Deparallelizing aims at removing all benefit and NALO spells
	
	* Number of status dummies that apply to each spell; a value above 1 means
	* the statuses defined so far are not mutually exclusive, 0 means that the
	* spell falls into the miscellaneous category.
	gen checkstatus = normalemp + margemp + vocational + unemp + unempbenefit1 + unempbenefit2 + almp + almpbenefit + searchemp
	tab checkstatus

	* Dummy for "not assigned to any status"
	gen miscstat = (checkstatus==0)
	drop checkstatus

	* Categorical status variable

	/* Further information:
	Further analysis builds upon categories 0 to 4. Hence, data prep aims at reassigning categories -4 to -1. Category -99 is ignored in the following.
	The order of coding the status variable is not random! Further code on clearing parallel spells, that is the rules for prioritizing different employments,
	is based on the order defined here. */

	* Start with the miscellaneous code, then assign the codes in ascending
	* order: if several dummies apply to a spell, the one listed last wins.
	gen status = -99 if miscstat==1
	local statcodes -4 -3 -2 -1 0 1 2 3 4
	local statflags searchemp almpbenefit unempbenefit2 unempbenefit1 unemp almp margemp normalemp vocational
	forvalues k = 1/9 {
		local code : word `k' of `statcodes'
		local flag : word `k' of `statflags'
		replace status = `code' if `flag'==1
	}

	* Note: We use "_" instead of blanks in label descriptions to avoid format problems in esttab output (csv).
	label define status                              ///
		-99 `"miscalleneous_(see_erwstat)"'          ///
		-4  `"searching_but_emp_(NALO)"'             ///
		-3  `"almp_benefit"'                         ///
		-2  `"unemp_benefit_2"'                      ///
		-1  `"unemp_benefit_1"'                      ///
		0   `"unemployed"'                           ///
		1   `"active_labour_mp"'                     ///
		2   `"marginal_employment"'                  ///
		3   `"normal_employment_(FT_or_PT)"'         ///
		4   `"vocational_training"'                  ///
		, modify
	label values status status

		* Check for missing values - there should be none, as every unassigned
		* spell is collected in misc (-99)
		count if missing(status)

		* Check if the dummies (now including miscstat) are mutually exclusive:
		* every spell should have a total of exactly 1
		gen exclusive_dummy = miscstat + searchemp + almpbenefit + unempbenefit1 + unempbenefit2 + unemp + almp + margemp + normalemp + vocational
		tab exclusive_dummy
		drop exclusive_dummy

		* Some status designations are changed later, so keep the original coding
		gen status_orig = status
	

/* ------------------------------------------------------------------------ */
/* (7) Sociodemographic and other individual characteristics				*/
/* ------------------------------------------------------------------------ */
*	Sociodemographic and other individual characteristics
	* (7.a) Age
	* (7.b) Gender
	* (7.c) Education
	* (7.d) Occupations
	* (7.e) Industry sectors

* Markers for the first and last spell of each person (ordered by begepi).
* Parentheses replace the square brackets originally used for grouping.
* Both markers are 1 on the marked spell and missing otherwise.
* NOTE(review): spells of one person with identical begepi are ordered
* arbitrarily, so the marked spell is not unique among parallel spells.
bys persnr (begepi): g firstpers=1 if (_n==1)
label var firstpers "first spell of that person in sample"
bys persnr (begepi): g lastpers=1 if (_n==_N)
label var lastpers "last spell of that person in sample"

* (7.a) Age
	* Age is measured in calendar years (difference of years, not exact age).
	gen jahrbeg = year(begepi)
	label var jahrbeg "year at begin of spell"
	gen agebeg = jahrbeg - year(geb_dat)
	label var agebeg "age at begin of spell"
	di "tab age at begin of first spell of sampled workers"
	tab agebeg if firstpers==1, m
	di "tab age at begin of last spell of sampled workers"
	tab agebeg if lastpers==1, m

	gen ageend = year(endepi) - year(geb_dat)
	label var ageend "age at end of spell"
	di "tab age at end of last spell of sampled workers"
	tab ageend if lastpers==1, m

	* Broad age categories: 1 = 18 to 30, 2 = 31 to 50, 3 = 51 and older.
	* Ages below 18 and missing ages stay missing. Missing values compare as
	* larger than any number in Stata, so the open-ended top category must
	* exclude them explicitly.
	gen ageendcat = 1 if ageend >= 18 & ageend <= 30
	replace ageendcat = 2 if ageend > 30 & ageend <= 50
	replace ageendcat = 3 if ageend > 50 & !missing(ageend)
	label var ageendcat "age at end of spell by broad category"
	* NOTE(review): the label text "30-50 years old" is kept unchanged although
	* category 2 starts at age 31.
	label define ageendcatLAB 1 "18-30 years old" 2 "30-50 years old" 3 "over 50 years old" 
	label values ageendcat ageendcatLAB
	tab ageendcat if firstpers==1, m

	** See section 6c on lignite and other work experience
	
* (7.b) Gender
	* The female dummy frau is built from sex_id only for the real IEB data
	* (sex_id 1 = men -> 0, sex_id 2 = women -> 1, anything else -> missing).
	* For the test data frau is not created here, so it must already exist.
	if ${iab}==1 {
		tab sex_id, m
		gen frau = cond(sex_id==1, 0, cond(sex_id==2, 1, .))
		label define frauLAB 0 "mann" 1 "frau"
		label values frau frauLAB
	}
	tab frau if firstpers==1
	
* (7.c) Education / CLEAN THE EDUCATION VARIABLE (IAB best practice)

*  (7.c.i) For better imputation, create approximation of labour market experience
*     => note we are double-counting parallel spells (as deparallelizing is below)
*	  => cannot move below deparallelizing as education imputation is used for wage imputation
*		=> and wages are used for deparallelizing
	sort persnr begepi
			* Capture length of spells: days (both end points included) for
			* normal employment (status 3) and vocational training (status 4),
			* 0 for every other status.
			* The variable dropped here is the one created on the next line.
			cap drop duremp
			gen duremp = 0
			replace duremp=(endepi - begepi) + 1 if (status==3 | status==4)
			label var duremp "experience in labour market (normalemp or vocational training)"
			sum duremp, d

	* General labor market experience: running sum of duremp within person,
	* in order of begepi
		by persnr (begepi): g genexp_help=sum(duremp)
		* The label text is enclosed in ordinary double quotes; the macro quote
		* characters used before are not string delimiters.
		label var genexp_help "approximate L mkt experience including doublecounting"
		sum genexp_help, d

	// Prepare dataset
	// Generating a categorical variable that distinguishes the observations by education
	// Preparing the dataset - extrapolating educational information (following Fitzenberger et al. (2005))

	// Preparation
	sort persnr begepi endepi
	tab ausbildung if quelle==1, m

	* Real data: the imputation is done in a separate do-file.
	if ${iab}==1 {
	noisily do ${prog}/ieb_education_imputation.do
	}

	* Test data: no imputation; bild is a copy of ausbildung and one dummy
	* Dbild_# is created per category of bild.
	if ${iab}==0 {
	gen bild=ausbildung
	tab bild, gen(Dbild_)
	}
			
* (7.d) Occupations

		* Two-digit occupational category: the occupation code beruf divided by
		* 1000 and rounded down.
		gen beruf02 = floor(beruf/1000)
		label var beruf02 "two-digit occupational categories"
		label define beruf02                                                                          ///
			11 "Land-,Tier-undForstwirtschaftsberufe"                                                 ///
			12 "GartenbauberufeFloristik"                                                             ///
			21 "Rohstoffgewinnung-aufbereitung,Glas-Keramik"                                          ///
			22 "KunststoffundHolzverarbeitung"                                                        ///
			23 "Papier-undDruckberufe,technischeMediengestaltung"                                     ///
			24 "Metallerzeugung-bearbeitung,Metallbauberufe"                                          ///
			25 "Maschinen-Fahrzeugtechnikberufe"                                                      ///
			26 "Mechatronik-Energie-Elektroberufe"                                                    ///
			27 "TechnischeForschungs-Entwicklungsberufe"                                              ///
			28 "Textil-Lederberufe"                                                                   ///
			29 "Lebensmittelherstellung-verarbeitung"                                                 ///
			31 "Bauplanungs-,Architektur-Vermessungsberufe"                                           ///
			32 "Hoch-Tiefbauberufe"                                                                   ///
			33 "(Innen-)Ausbauberufe"                                                                 ///
			34 "Gebäude-versorgungstechnische Berufe"                                                 ///
			41 "Mathematik-,Biologie-,Chemie-Physikberufe"                                            ///
			42 "Geologie-,Geografie-,Umweltschutzberufe"                                              ///
			43 "Informatik-,Informations-,Kommunikationstechnologieberufe"                            ///
			51 "Verkehrs-,Logistikberufe(außerFahrzeugführung)"                                       ///
			52 "Führer/innenFahrzeug-,Transportgeräten"                                               ///
			53 "Schutz-,Sicherheits-,Überwachungsberufe"                                              ///
			54 "Reinigungsberufe"                                                                     ///
			61 "Einkaufs-,Vertriebs-,Handelsberufe"                                                   ///
			62 "Verkaufsberufe"                                                                       ///
			63 "Tourismus-,Hotel-,Gaststättenberufe"                                                  ///
			71 "Unternehmensführung,-organisation"                                                    ///
			72 "Finanzdienstleistungen,RechnungswesenSteuerberatung"                                  ///
			73 "RechtundVerwaltung"                                                                   ///
			81 "MedizinischeGesundheitsberufe"                                                        ///
			82 "NichtmedizinischeGesundheits-,Körperpflege-,wellnessberufe,Medizintechnik"            ///
			83 "Erziehung,SozialeHauswirtschaftlicheBerufe,Theologie"                                 ///
			84 "LehrendeAusbildendeBerufe"                                                            ///
			91 "Sprach-,literatur-,geistes-,gesellschafts-wirtschaftswissenschaftliche Berufe"        ///
			92 "werbung,marketing,Medienberufe"                                                       ///
			93 "ProduktdesignKunsthandwerk"                                                           ///
			94 "DarstellungUnterhaltung"                                                              ///
			01 "Militaer"
		label values beruf02 beruf02
		tab beruf02 if firstpers==1 & beruf02>0, m
		tab beruf02 if lastpers==1 & beruf02>0, m

		* Coarser grouping: ten selected two-digit occupations keep their own
		* category; every other value of beruf02 (including missing) is collected
		* in category 12. Code 3 is not used.
		gen beruf12 = 12
		replace beruf12 = 1  if beruf02 == 25
		replace beruf12 = 2  if beruf02 == 71
		replace beruf12 = 4  if beruf02 == 52
		replace beruf12 = 5  if beruf02 == 24
		replace beruf12 = 6  if beruf02 == 51
		replace beruf12 = 7  if beruf02 == 27
		replace beruf12 = 8  if beruf02 == 32
		replace beruf12 = 9  if beruf02 == 21
		replace beruf12 = 10 if beruf02 == 26
		replace beruf12 = 11 if beruf02 == 34
		label var beruf12 "top ten occupations in mining + other"
		label define beruf12                                         ///
			1  "Maschinen-Fahrzeugtechnikberufe"                     ///
			2  "Unternehmensführung,-organisation"                   ///
			4  "Führer/innenFahrzeug-,Transportgeräten"              ///
			5  "Metallerzeugung-bearbeitung,Metallbauberufe"         ///
			6  "Verkehrs-,Logistikberufe(außerFahrzeugführung)"      ///
			7  "TechnischeForschungs-Entwicklungsberufe"             ///
			8  "Hoch-Tiefbauberufe"                                  ///
			9  "Rohstoffgewinnung-aufbereitung,Glas-Keramik"         ///
			10 "Mechatronik-Energie-Elektroberufe"                   ///
			11 "Gebäude-versorgungstechnische Berufe"                ///
			12 "Other occupations"                                   ///
			, add
		label values beruf12 beruf12
		tab beruf12, m
		
* (7.e) Industry sectors

	* Test data (iab==0): categorical industry variable built from the
	* three-digit code w93_3. Codes not covered by any rule keep the value 0.
	if ${iab}==0 {

	gen wirtschaftszweige = 0
	replace wirtschaftszweige = 1  if inrange(w93_3, 11, 20)
	replace wirtschaftszweige = 2  if w93_3 == 50
	replace wirtschaftszweige = 3  if inlist(w93_3, 101, 102, 103, 111, 112, 120, 131, 132, 141, 142, 143, 144, 145)
	replace wirtschaftszweige = 4  if inrange(w93_3, 151, 223) | inrange(w93_3, 232, 372)
	replace wirtschaftszweige = 5  if inlist(w93_3, 402, 410)
	replace wirtschaftszweige = 6  if inrange(w93_3, 451, 455)
	replace wirtschaftszweige = 7  if inrange(w93_3, 501, 527)
	replace wirtschaftszweige = 8  if inrange(w93_3, 551, 555)
	replace wirtschaftszweige = 9  if inrange(w93_3, 601, 642)
	replace wirtschaftszweige = 10 if inrange(w93_3, 651, 672)
	replace wirtschaftszweige = 11 if inrange(w93_3, 701, 748)
	replace wirtschaftszweige = 12 if inrange(w93_3, 751, 753)
	replace wirtschaftszweige = 13 if inrange(w93_3, 801, 804)
	replace wirtschaftszweige = 14 if inrange(w93_3, 851, 853)
	replace wirtschaftszweige = 15 if inrange(w93_3, 900, 930)
	replace wirtschaftszweige = 16 if w93_3 == 950
	replace wirtschaftszweige = 17 if w93_3 == 990

	* Value labels (underscores instead of blanks, as elsewhere in this file)
	label define wz_codes                                                  ///
		0  "Undefiniert"                                                   ///
		1  "Land-_Forstwirtschaft"                                         ///
		2  "Fischerei_Fischzucht"                                          ///
		3  "Bergbau_Gewinnung_v_Steinen_Erden"                             ///
		4  "Verarbeitendes_Gewerbe"                                        ///
		5  "Energie-_Wasserversorgung"                                     ///
		6  "Baugewerbe"                                                    ///
		7  "Handel_Instandhaltung_Reperatur_v_Kraftfahrzeugen"             ///
		8  "Gastgewerbe"                                                   ///
		9  "Verkehr_Nachrichtenübermittlung"                               ///
		10 "Kredit-_Versicherungsgewerbe"                                  ///
		11 "Immobilienbranche"                                             ///
		12 "Öffentl_Verwaltung_Verteidigung_Sozialversicherung"            ///
		13 "Erziehung_Unterricht"                                          ///
		14 "Gesundheits-_Veterinär-_Sozialwesen"                           ///
		15 "Erbringung_v_sonstigen_öffentl_persönl_Dienstleistungen"       ///
		16 "Priv_Haushalte"                                                ///
		17 "Exterritoriale_Organisationen"
	label values wirtschaftszweige wz_codes
	}

	* Real IEB data (iab==1): categorical industry variable built from wz08.
	* wz08 is compared as a number whose first two (of five) digits are the
	* industry division, e.g. 5200 = division 05. Codes not covered by any rule
	* keep the value 0 ("Undefiniert").
	* NOTE(review): the grouping appears to follow the A*38 aggregation of the
	* WZ 2008 classification - confirm.
	* Two ranges cover complete divisions, in line with all other ranges:
	*   - class 25 (financial and insurance services) spans divisions 64-66
	*   - class 37 (private households) spans divisions 97-98, i.e. up to 98999
	* NOTE(review): the label texts are left unchanged, including the typos
	* "Gewinning" (2), "Verkehrsdienstleistungen" (25, presumably
	* "Versicherungsdienstleistungen") and "Reperatur" (15, 19).
	if ${iab}==1 {
		* Gen categorical industry variable 
		gen wirtschaftszweige = 0
		replace wirtschaftszweige = 1 if inrange(wz08,1000,3999)
		replace wirtschaftszweige = 2 if inrange(wz08,5000,9999)
		replace wirtschaftszweige = 3 if inrange(wz08,10000,12999)
		replace wirtschaftszweige = 4 if inrange(wz08,13000,15999)
		replace wirtschaftszweige = 5 if inrange(wz08,16000,18999)
		replace wirtschaftszweige = 6 if inrange(wz08,19000,19999)
		replace wirtschaftszweige = 7 if inrange(wz08,20000,20999)
		replace wirtschaftszweige = 8 if inrange(wz08,21000,21999)
		replace wirtschaftszweige = 9 if inrange(wz08,22000,23999)
		replace wirtschaftszweige = 10 if inrange(wz08,24000,25999)
		replace wirtschaftszweige = 11 if inrange(wz08,26000,26999)
		replace wirtschaftszweige = 12 if inrange(wz08,27000,27999)
		replace wirtschaftszweige = 13 if inrange(wz08,28000,28999)
		replace wirtschaftszweige = 14 if inrange(wz08,29000,30999)
		replace wirtschaftszweige = 15 if inrange(wz08,31000,33999)
		replace wirtschaftszweige = 16 if inrange(wz08,35000,35999)
		replace wirtschaftszweige = 17 if inrange(wz08,36000,39999)
		replace wirtschaftszweige = 18 if inrange(wz08,41000,43999)
		replace wirtschaftszweige = 19 if inrange(wz08,45000,47999)
		replace wirtschaftszweige = 20 if inrange(wz08,49000,53999)
		replace wirtschaftszweige = 21 if inrange(wz08,55000,56999)
		replace wirtschaftszweige = 22 if inrange(wz08,58000,60999)
		replace wirtschaftszweige = 23 if inrange(wz08,61000,61999)
		replace wirtschaftszweige = 24 if inrange(wz08,62000,63999)
		replace wirtschaftszweige = 25 if inrange(wz08,64000,66999)
		replace wirtschaftszweige = 26 if inrange(wz08,68000,68999)
		replace wirtschaftszweige = 27 if inrange(wz08,69000,71999)
		replace wirtschaftszweige = 28 if inrange(wz08,72000,72999)
		replace wirtschaftszweige = 29 if inrange(wz08,73000,75999)
		replace wirtschaftszweige = 30 if inrange(wz08,77000,82999)
		replace wirtschaftszweige = 31 if inrange(wz08,84000,84999)
		replace wirtschaftszweige = 32 if inrange(wz08,85000,85999)
		replace wirtschaftszweige = 33 if inrange(wz08,86000,86999)
		replace wirtschaftszweige = 34 if inrange(wz08,87000,88999)
		replace wirtschaftszweige = 35 if inrange(wz08,90000,93999)
		replace wirtschaftszweige = 36 if inrange(wz08,94000,96999)
		replace wirtschaftszweige = 37 if inrange(wz08,97000,98999)
		replace wirtschaftszweige = 38 if inrange(wz08,99000,99999)

		# delimit ;
		label define wz_codes
			0 "Undefiniert"
			1  "Landwirtschaft_Forstwirtschaft_Fischerei"
			2  "Bergbau_Gewinning_v_Steinen_Erden"
			3  "Herstellung_v_Nahrungs-_und_Genussmitteln,_Getränken_und_Tabakerzeugnissen"
			4  "Herstellung_v_Textilien,_Bekleidung,_Leder,_Lederwaren_und_Schuhen"
			5  "Herstellung_v_Holzwaren,_Papier,_Pappe_und_Waren_daraus,_Herstellung_v_Druckerzeugnissen"
			6  "Kokerei_und_Mineralölverarbeitung"
			7  "Herstellung_v_chemischen_Erzeugnissen"
			8  "Herstellung_v_pharmazeutischen_Erzeugnissen"
			9  "Herstellung_v_Gummi_und_Kunststoffwaren_sowie_v_Glas_und_Glaswaren,_Keramik,_Verarbeitung_v_Steinen_und_Erden"
			10 "Metallerzeugung_und_-bearbeitung,_Herstellung_v_Metallerzeugnissen"
			11 "Herstellung_v_Datenverarbeitungsgeräten,_elektronischen_und_optischen_Erzeugnissen"
			12 "Herstellung_v_elektrischen_Ausrüstungen"
			13 "Maschinenbau"
			14 "Fahrzeugbau"
			15 "Sonstige_Herstellung_v_Waren,_Reperatur_und_Installation_v_Maschinen_und_Ausrüstung"
			16 "Energieversorgung"
			17 "Wasserversorgung;_Abwasser-_und_Abfallentsorgung_und_Beseitigung_v_Umweltverschmutzungen"
			18 "Baugewerbe"
			19 "Handel;_Instandhaltung_und_Reperatur_v_KFZ"
			20 "Verkehr_und_Lagerei"
			21 "Gastgewerbe"
			22 "Verlagswesen,_audiovisuelle_Medien_und_Rundfunk"
			23 "Telekommunikation"
			24 "Informationstechnologische_und_Informationsdienstleistungen"
			25 "Erbringung_v_Finanz-_und_Verkehrsdienstleistungen"
			26 "Grundstücks-_und_Wohnungswesen"
			27 "Erbringung_v_freiberuflichen_und_technischen_Dienstleistungen"
			28 "Wissenschaftliche_Forschung_und_Entwicklung"
			29 "Sonstige_freiberufliche,_wissenschaftliche_und_technische_Tätigkeiten"
			30 "Erbringung_v_sonstigen_wirtschaftlichen_Dienstleistungen"
			31 "Öffentliche_Verwaltung,_Verteidigung;_Sozialversicherung"
			32 "Erziehung_und_Unterricht"
			33 "Gesundheitswesen"
			34 "Heime_und_Sozialwesen"
			35 "Kunst,_Unterhaltung"
			36 "Sonstige_Dienstleistungen"
			37 "Private_Haushalte_mit_Hauspersonal_Haushaltsnahe_DL"
			38 "Exterritoriale_Organisationen_und_Körperschaften"
			;
		# delimit cr
		label values wirtschaftszweige wz_codes
	}
	* Industry of the first and of the last spell of each person
	* (applies to both data sources)
	tab wirtschaftszweige if firstpers==1 & wirtschaftszweige>0, m	
	tab wirtschaftszweige if lastpers==1 & wirtschaftszweige>0, m	

	
/* ------------------------------------------------------------------------ */
/* (8) Parallel spells & imputing social benefits */
/* ------------------------------------------------------------------------ */

* NB. wage deflation below, as we need nominal values to establish "Dazuverdiener" rules etc.

/* 	Outline                                                                 */
	* (8.a) Extra-payments: Add to wage of non-extra payment spell
	* (8.b) Identify parallel spells 
	* (8.c) Eliminate parallel spells
	* (8.d) Including income from social benefits
	* (8.e) Deflating incomes & Imputing above assessment ceiling

	/* ------------------------------------------------------------------------ */
	* (8.a) Clear spells of extra payment 
	/* ------------------------------------------------------------------------ */
		/* Comment:	 
		   Extrapayment spells are cleared by apportioning them to existing employment spells from the same employer (identified by betnr).
		   That is, we divide them among other spells at the same employer and the same year and then simply drop them. */
	/* ------------------------------------------------------------------------ */
	
	* Identifying spells of extrapayment (grund == 154)
	sort persnr begepi
	gen extrapay = 0
	replace extrapay = 1 if grund == 154 
	
		* Calculating the duration of the spells in days (first and last day both count)
		gen dur = (endepi - begepi) + 1
		sum dur if grund == 154, detail 
								* Analysing the duration of extrapayment spells
								* three-quarter of extrapayment spells are one month (Christmas money?)

		* Capture the number of non-extrapayment spells by person, firm, and year.
		* Extrapayment spells get a missing group ID, so they are not counted and
		* end up with extrapay_num == 0.
		* NOTE(review): the count uses year(endepi) whereas the sums below use "jahr" -
		* confirm that both always refer to the same year.
		cap drop jahrend
		gen jahrend = year(endepi)
		egen extrapay_ID = group(persnr betnr jahrend)
		replace extrapay_ID = . if extrapay == 1
		gen extrapay_num_help = 1 if extrapay_ID != .
		bysort extrapay_ID: egen extrapay_num = total(extrapay_num_help)
		drop jahrend extrapay_ID extrapay_num_help
	
	* Calculating the earnings of the extrapayment spells (daily wage times days)
	gen extrapayment = tentgelt*dur if grund == 154
	
	* Calculating the sum of all extrapayment earnings per person, establishment, and year
	bysort persnr betnr jahr: egen sum_extrapayment = total(extrapayment)
	
	* Calculating the sum of total days worked at an employer without extra-payment spells
	* (dur_empl is missing on the extrapayment spells themselves because of the if-qualifier)
	bysort persnr betnr jahr: egen dur_empl = total(dur) if grund != 154
	
	* Calculating the daily amount of extra-payment
	gen daily_extrapayment_empl = (sum_extrapayment/dur_empl)*(1/extrapay_num)
	
		* Check the results
		gen daily_extrapayment_empl_help = (sum_extrapayment/dur_empl)
		gen daily_extrapayment_empl_help2 = extrapay_num * daily_extrapayment_empl
		sum daily_extrapayment_empl_help2 if daily_extrapayment_empl_help2 > 0
		sum daily_extrapayment_empl_help if daily_extrapayment_empl_help > 0 // They should give the same mean
		drop daily_extrapayment_empl_help daily_extrapayment_empl_help2
	 
	* Add the daily extrapayment wage to the wage of non-extra-payment spells
	* extra payments (at least using this definition) appear not very important
		
		replace tentgelt = tentgelt + daily_extrapayment_empl if grund != 154 & daily_extrapayment_empl != .
	
	* Check extrapayment spells that can not be apportioned to existing employments (since there is no appropriate employment)
		bysort persnr betnr jahr: egen dur_empl_help = max(dur_empl) // Here we transfer the information about employment duration (NOT in extrapay spells) to extrapayment spells
		
		* "dur_empl_help" gives the number of days worked at the same employer and within the same year.
		* If there is no regular spell in the group, every dur_empl is missing and so is dur_empl_help.
		* Missing values compare as larger than any number in Stata, so the missing case has to be
		* tested explicitly; "dur_empl_help<=0" alone never flags such a spell.
		gen extrapay_unmatched = 0
		replace extrapay_unmatched = 1 if grund==154 & (mi(dur_empl_help) | dur_empl_help<=0)

		* Check unmatched extrapayment spells
		tab extrapay_unmatched
		sum tentgelt if extrapay_unmatched == 1 // extrapay_unmatched == 1 for those spells of extrapayment that can not be apportioned to other existing employment spells
		cap hist tentgelt if extrapay_unmatched == 1
	
		* Drop all extrapayment spells (matched and unmatched alike)
		drop if grund == 154
	 
	*******************************************************************************		
	* (8.b) Identify parallel spells 
	*******************************************************************************		
	
	* Flag parallel spells: after sorting, a spell is parallel when the observation
	* directly before or after it has the same person ID (persnr) and the same
	* episode start date (begepi)
	
		sort persnr begepi
		gen parallel = 0
		replace parallel = 1 if (persnr==persnr[_n+1] & begepi==begepi[_n+1]) | (persnr==persnr[_n-1] & begepi==begepi[_n-1])

	*******************************************************************************		
	* (8.c) Eliminate parallel spells
	*******************************************************************************		
		* Treatment of characteristic parallel spells combinations
		
			/* Comment on income information and parallel spells:

			After the treatment here we should have one single main spell per person at any given time. 
			1 - The variable "tentgelt" will contain the income information from this main spell only.

			We want to ensure that no income information is lost when parallel spells are dropped. 
			2 - We separately save the sum of income from margemp/UE-benefit/ALMP-benefit/miscemp by persnr (personal ID). 

			3 - => By subtracting tentgelt of the main spell from the suminc-variables, 
			we further identify extrainc-information, that is, income from sources other than the main spell. */					

	*******************************************************************************		
	* (8.c.i) Deparallelizing Miscallaneous (unclear) spells	
	*******************************************************************************		

		/*	Rules to clear parallel misc spells:
				* 8.c.i.1) If a misc spell is paralleled by a non-misc spell, it is simply dropped. Income from misc is saved seperately.
				* 8.c.i.2) If a misc spell is paralleled only by other misc spells, we first prioritize lignite spells and thereafter we clear by highest tentgelt.
				* 8.c.i.3) If a misc spell is not paralleled, we keep it without preparation. */

	* Lowest and highest status within each stack of spells sharing persnr and begepi
		egen max_status = max(status), by(persnr begepi)
		egen min_status = min(status), by(persnr begepi)
		
	* Separately keep the sum of income from misc spells
			gen suminc_Z_help = 0
			replace suminc_Z_help = tentgelt if status == -99 // Note here that "-99" is the code for miscellaneous spells.
			egen suminc_Z = total(suminc_Z_help), by(persnr begepi)
			drop suminc_Z_help

			replace suminc_Z = suminc_Z * tentgeltdays // transform daily pay into monthly pay.
			label variable suminc_Z "Sum of tentgelt from miscellaneous spells (monthly)"
			
	***************************************************************************
	*** (8.c.i.1) Drop misc spells that are paralleled by nonmisc spells   ******
	***************************************************************************
			// Here we drop misc spells that are paralleled by at least one nonmisc spell. 
			// For a misc spell, max_status > min_status means that the stack also holds a spell with a different (higher) status.
	
			drop if status == -99 & (max_status > min_status) & !mi(max_status) & !mi(min_status) 
	
	***********************************************************************************************************************
	* (8.c.i.2) Drop non-lignite misc spells paralleled by lignite misc spells & thereafter we clear by highest tentgelt.
	***********************************************************************************************************************

			* Here we drop misc spells that are 
			*	(a) not themselves lignite
			*	(b) paralleled only by other misc spells 
			*	(c) at least one of these other misc spells is lignite
			* NOTE(review): this presumes somelignite flags stacks that contain a lignite spell - confirm where it is defined.
			drop if status == -99 & (max_status == min_status) & parallel == 1 & thisspelllignite == 0 & somelignite == 1 
			
			* Check the results
			if ${iab}==1 {
			// We should have only lignite spells here:
			tab wz08 if status == -99 & max_status == min_status & parallel == 1 & somelignite == 1       
			// There should be no lignite spell here:
			tab wz08 if status == -99 & max_status == min_status & parallel == 1 & somelignite == 0, sort 
			}				
		
			/* Drop remaining parallel miscellaneous spells
			Stacks that consist of misc spells only (max and min of status both equal "-99")
			are cleared by tentgelt: only the misc spell with the highest tentgelt is kept.

			The ranking is done inside one bysort call on a negated copy of tentgelt, so it does
			not depend on an earlier sort order still being in place. Spells with missing
			tentgelt are ranked last. No helper variable is left behind.
			*/
		gen double neg_tentgelt_help = -tentgelt
		bysort persnr begepi (neg_tentgelt_help): gen deldup = _n if (min_status == max_status) & (status == -99)
		tab deldup 
		drop if deldup > 1 & !mi(deldup)
		drop deldup neg_tentgelt_help
		
	*******************************************************************************		
	* (8.c.ii) Deparallelizing NALO spells	
	*******************************************************************************				
	* Treatment of parallel NALOs (individuals who are not unemployed but registered as searching a new job)
	* If NALO spells are paralleled by spells other than NALO, the NALO spell is simply dropped.
	* Although NALO means there is alternative employment, for some NALO spells we have no parallel other spell information.
			
			* Redefine parallel and highest/lowest status.
			* Misc spells were dropped after min_status/max_status had been computed, so a stack
			* that used to contain a misc spell would still carry min_status == -99 and its
			* NALO spell would escape the drop below. Both helpers are therefore rebuilt here.
			sort persnr begepi
			cap drop parallel
			gen parallel = 0
			replace parallel = 1 if persnr==persnr[_n+1] & begepi==begepi[_n+1]
			replace parallel = 1 if persnr==persnr[_n-1] & begepi==begepi[_n-1]
			cap drop min_status max_status
			egen max_status = max(status), by(persnr begepi)
			egen min_status = min(status), by(persnr begepi)

			* Drop a NALO spell (status == -4) when its stack also holds a spell with a higher status
			drop if status == -4 & min_status == -4 & max_status > -4 & parallel==1
				
	*******************************************************************************		
	* (8.c.iii) Summing margemp & normemp incomes & re-categorizing normemp & margemp
	*******************************************************************************				

				* Sum regular income to "suminc_R"
				* "Regular" employment here means status 3 or 4 (normemp and vocational spells).
				* The helper is missing on all other spells; total() counts missing as zero.
				gen suminc_R_help =tentgelt if (status == 3 | status == 4) 
				egen suminc_R = total(suminc_R_help), by(persnr begepi) 
				replace suminc_R = suminc_R * tentgeltdays // transform daily pay into monthly pay.
				label variable suminc_R "Sum of tentgelt from different normalemp (FT/PT) & vocational training (monthly)"
				drop suminc_R_help

				* Separately keep the sum of income from marginal employment (status == 2)
				gen suminc_M_help = 0
				replace suminc_M_help = tentgelt if status == 2 
				egen suminc_M = total(suminc_M_help), by(persnr begepi)
				drop suminc_M_help
				* The sum is scaled by tentgeltdays exactly like suminc_R, so the label has to read
				* "monthly" as well (it used to say "daily").
				replace suminc_M = suminc_M * tentgeltdays
				label variable suminc_M "Sum of tentgelt from marginal employment (monthly)"
								
	*******************************************************************************		
	* (8.c.iv) Deparallelizing Dazuverdiener (unemp & margemp)
	*******************************************************************************				
				/* Comment: 
				* Identify Dazuverdiener and classify as unemployed. Define Dazuverdiener as
				* (1) Unemployed individuals; (2) with marginal employment (3) if income lower than "margempee". 

				Rules to clear Dazuverdiener spells:
				* We treat Dazuverdiener as being unemployed, that is we want to drop the parallel margemp as well as the potential 
				  parallel UE-benefit. In the end, we only want to have a single unemployment spell, while seperately saving the income 
				  from UE-benefit and the margemp.
				* Since the employment status of Dazuverdiener depends on how much they "regularly" earn besides the UE benefits, 
				  we first track Dazuverdiener, then we save benefit income information based on the Dazuverdiener dummy,
				  before finally dropping parallel spells casewise.*/
	
				* Redefine highest/lowest status
				drop min_status max_status
				egen max_status = max(status), by(persnr begepi)
				egen min_status = min(status), by(persnr begepi)

				* Identify ALMP participants, as we want to exclude them here	
				* This chapter ignores those who may participate in an ALMP measure. 
				* ALMP measures (single or paralleled by other spells) are treated separately below.
				* almp_part is a stack-level flag: 1 if any spell of the stack is a parallel ALMP spell (status -3 or 1).
				gen almp_part_help = 0
				replace almp_part_help = 1 if (status == -3 | status == 1) & parallel == 1
				bysort persnr begepi: egen almp_part = max(almp_part_help)
				drop almp_part_help

				* Dazuverdiener: lowest status is an unemployment status (-2/-1/0), highest status is
				* marginal employment (2), no ALMP, and the summed marginal income does not exceed the
				* threshold of the year in which the spell starts.
				* The condition uses stack-level variables only, so no by-prefix is needed.
				gen dazuverdiener = 0
				forvalues t=1975/2017{
				replace dazuverdiener = 1 if inlist(min_status,-2,-1,0) & max_status == 2 & almp_part == 0 & suminc_M <= ${margempee_`t'} & jahrbeg ==`t'
				}		
				* Check the results
				tab status if dazuverdiener == 1
				tab status if dazuverdiener == 1 & status == min_status // There should only be unemp (benefit) spells (status == -2/-1/0). 
		
				* Separately keep income from ALMP 
				* (all suminc_* variables below are scaled by tentgeltdays and are therefore labelled "monthly")
				gen suminc_A_help = 0
				replace suminc_A_help = tentgelt if (status == -3 | status == 1) 
				egen suminc_A = total(suminc_A_help), by(persnr begepi)
				drop suminc_A_help
				replace suminc_A = suminc_A * tentgeltdays
				label variable suminc_A "Sum of tentgelt from ALMP (monthly)"
				
				* Separately keep income from ALG1 benefit if *NOT* Dazuverdiener
				* Note tentgelt only provides information from LeH not from MTH & LHG (see (7d))
				gen suminc_B_help = 0
				replace suminc_B_help = tentgelt if (status == -2 | status == -1) & dazuverdiener == 0 
				egen suminc_B = total(suminc_B_help), by(persnr begepi)
				drop suminc_B_help
				replace suminc_B = suminc_B * tentgeltdays
				label variable suminc_B "Sum of tentgelt from ALG1/ALG2 benefit if ~Dazuverdiener (monthly)"

				* Separately keep income from ALG1 benefit if Dazuverdiener
				gen suminc_D_help = 0
				replace suminc_D_help = tentgelt if (status == -2 | status == -1) & dazuverdiener == 1 
				egen suminc_D = total(suminc_D_help), by(persnr begepi)
				drop suminc_D_help
				replace suminc_D = suminc_D * tentgeltdays
				label variable suminc_D "Sum of tentgelt from ALG1/ALG2 benefit if Dazuverdiener (monthly)"

				* Drop the margemp spells and the redundant benefit spells of Dazuverdiener,
				* then classify what remains of the stack as unemployed (status 0)
				drop if dazuverdiener==1 & (status==2)
				drop if dazuverdiener==1 & (status==-1 & min_status==-2)
				replace status = 0 if dazuverdiener == 1 
			
				/* Note there may be individuals that receive unemp benefits without being parallely registered as unemployed. For these individuals, 
				we may first keep the benefit spell and afterwards need to change the status to "registered being unemployed" (status == 0).*/
				
				* Correct income information for Dazuverdiener
				* replace tentgelt = suminc_D if dazuverdiener == 1 // As we assign Dazuverdiener the status "unemployed" the unemployment benefit represents their "regular" income
			
				* Check the results
				tab status if dazuverdiener == 1 // We would expect here to only have unemployment spells, though some of which may be parallel.
				sum suminc_B if dazuverdiener == 1 // There should be no income information here.
				sum suminc_B if dazuverdiener == 0 // There should be income information here.
				sum suminc_D if dazuverdiener == 1 // There should be income information here.
				sum suminc_D if dazuverdiener == 0 // There should be no income information here.
				sum suminc_M if dazuverdiener == 1 // These should be within the range 0 <= inc <= ${margempee}
		
		*********************************************************************
		* (8.c.v) Treatment of short-term small overlaps (short-term: <= 5 days)
		*        (remove small parallel spells)
		*********************************************************************
			* Small overlaps are defined as being smaller than or equal to 5 days (define global/macro to change at one point in code). 

			* 8.c.v.1) Parallel employment/ALMP measure and unemployment)
			* We assume the unemp notification to be incorrect and instead assign the employment/ALMP status.
			
			* Rebuild the stack-level helpers on the current data: highest/lowest status and the parallel flag
			cap drop min_status max_status parallel
			egen max_status = max(status), by(persnr begepi)
			egen min_status = min(status), by(persnr begepi)
			sort persnr begepi
			gen parallel = 0
			replace parallel = 1 if (persnr==persnr[_n+1] & begepi==begepi[_n+1]) | (persnr==persnr[_n-1] & begepi==begepi[_n-1])
			
			* Spell length as the difference between end and start date
			gen spell_length = endepi - begepi
						
			* Small overlap: a parallel spell of at most ${small_overlaps} in length whose stack ranges from
			* an unemployment status (-2/-1/0) up to an ALMP measure/employment status (1/2/3/4)
			gen small_overlaps = 0
			bysort persnr begepi: replace small_overlaps = 1 if inlist(min_status,-2,-1,0) & inlist(max_status,1,2,3,4) & spell_length <= ${small_overlaps} & parallel == 1
			
			* Check the results
			tab status if small_overlaps == 1
			bys persnr begepi: g deldup=_n if small_overlaps == 1
			tab deldup
			drop del*
			
			* Keep only the highest-status spell(s) of the stack ...
			bysort persnr begepi: drop if small_overlaps == 1 & (status < max_status)
			* ... and, where the stack is flagged as containing lignite, only the lignite spell(s)
			bysort persnr begepi: drop if small_overlaps == 1 & somelignite == 1 & thisspelllignite == 0 
	
		* If, after dropping the unemp spells as well as possible "lower status" employments, 
		* there remain parallel spells that include at least one lignite spell, we only keep the lignite spell(s)

			* Redefine parallel & highest tentgelt within stack of parallel spells
			sort persnr begepi
			drop parallel
			gen parallel = 0
			replace parallel = 1 if (persnr==persnr[_n+1] & begepi==begepi[_n+1]) | (persnr==persnr[_n-1] & begepi==begepi[_n-1])
			egen max_tentgelt = max(tentgelt), by(persnr begepi)
				
			* Among the spells that are still parallel, drop those whose tentgelt is below the stack
			* maximum or missing (only where the stack has a non-missing maximum at all)
			bysort persnr begepi: drop if small_overlaps == 1 & parallel == 1 & !mi(max_tentgelt) & (tentgelt < max_tentgelt | mi(tentgelt))

			* Check the results
				tab status if small_overlaps == 1 // There should be no unemployment spells here
				sum suminc_B if small_overlaps == 1 // There should be income information here
				sum suminc_D if small_overlaps == 1 // There should be no income information here
		
		*********************************************************************
		* (8.c.vi) Treatment of unemployed minijobber (with large minijobs)
		*********************************************************************
		* Treatment of individuals registered as unemployed but with marginal employment
		* *above* the income threshold (those at or below the threshold are treated as unemployed Dazuverdiener, see 8.c.iv).
		* Here, we do not treat ALMP spells. ALMP spells are treated separately below.
		
			* Rebuild highest/lowest status per stack (persnr begepi)
			drop min_status max_status
			egen max_status = max(status), by(persnr begepi)
			egen min_status = min(status), by(persnr begepi)
			
			* Unemployed minijobber: lowest status is an unemployment status (-2/-1/0), highest status is
			* marginal employment (2), no ALMP, and the summed marginal income lies above the threshold
			* of the year in which the spell starts
			gen mini_ue = 0
			forvalues t=1975/2017{
			bysort persnr begepi: replace mini_ue = 1 if inlist(min_status,-2,-1,0) & max_status == 2 & almp_part == 0 & suminc_M > ${margempee_`t'} & jahrbeg ==`t' 
			}

			* Check: the population of unemployed minijobbers
			tab status if mini_ue == 1
			tab status if mini_ue == 1 & status == min_status
			tab status if mini_ue == 1 & status == max_status 
			// There should be at least as many marginal employments as unemployment observations from above. 
			* If there are more margemps, some individuals may have multiple margemps.
			* Check: numbering the flagged spells within persnr begepi shows how large the stacks are
			bys persnr begepi: g deldup=_n if mini_ue == 1
			tab deldup
			drop del*

			* For these unemployed minijobbers keep only the spell(s) with the highest status of the stack
			bysort persnr begepi: drop if mini_ue==1 & (status < max_status)

			* Check: only marginal employment spells should be left, though some of them may still be parallel
			tab status if mini_ue == 1 
			bys persnr begepi: g deldup=_n if mini_ue == 1 // Check for duplicates
			tab deldup
			sum suminc_B if mini_ue == 1 // There should be income information here
			sum suminc_D if mini_ue == 1 // There should be no income information here
			sum suminc_M if mini_ue == 1 // These should be > ${margempee}
			drop del*

		*********************************************************************
		* (8.c.vii) Treatment of multi-minijobbers
		*********************************************************************
		/* NB (!) The two rules on Minijobbers have to run in this order to work! First, potential UE spells are dropped
		and if there still are several ME spells, the one with the highest tentgelt is kept as the "main" job. */
		
			* Redefine highest/lowest status
			drop min_status max_status
			egen max_status = max(status), by(persnr begepi)
			egen min_status = min(status), by(persnr begepi)
			
			* Redefine parallel
			sort persnr begepi
			drop parallel
			gen parallel = 0
			replace parallel = 1 if persnr==persnr[_n+1] & begepi==begepi[_n+1]
			replace parallel = 1 if persnr==persnr[_n-1] & begepi==begepi[_n-1]
			
			* Identify multi-minijobber: a parallel stack that consists of marginal employment (status 2) only
			* and belongs neither to ALMP participants, Dazuverdiener nor unemployed minijobbers.
			* mini_emp is 1 for these spells and missing otherwise.
			bysort persnr begepi: gen mini_emp = 1 if (min_status == max_status) & min_status == 2 & parallel == 1 & almp_part == 0 & dazuverdiener == 0 & mini_ue == 0
			
				* Check the results
				tab status if mini_emp == 1
				*cap duplicates report persnr begepi if mini_emp == 1
				bys persnr begepi: g deldup=_n  if mini_emp == 1
				tab deldup
				drop del*
				sum suminc_A if mini_emp == 1 // There should be no income information here
				sum suminc_B if mini_emp == 1 // There should be no income information here
				sum suminc_D if mini_emp == 1 // There should be no income information here
				sum suminc_M if mini_emp == 1 // There should be income information here
			
			* Drop lower-tentgelt minijobs
				* Step 1: if there are non-lignite and lignite minijobs, drop the non-lignite minijobs
				* irrespective of tentgelt (this drop used to be written twice; once is enough)
				* NOTE(review): this presumes somelignite == 1 implies that a lignite spell is still present in the stack - confirm.
				bysort persnr begepi: drop if (min_status == max_status) & min_status == 2 & parallel == 1 & somelignite == 1 & thisspelllignite == 0 & almp_part == 0 & dazuverdiener == 0 & mini_ue == 0

				* Step 2: define the highest tentgelt by persnr and begepi among the spells that are left.
				* This has to happen AFTER step 1: a maximum taken before would still reflect a dropped
				* non-lignite minijob, and step 3 would then delete every remaining lignite minijob
				* that earns less than the spell which no longer exists.
				cap drop max_tentgelt
				bys persnr begepi: egen max_tentgelt = max(tentgelt)
				replace max_tentgelt=0 if mi(max_tentgelt)

				* Step 3: if there are several lignite minijobs, drop the minijobs with the lower tentgelt
				bysort persnr begepi: drop if (min_status == max_status) & min_status == 2 & parallel == 1 & somelignite == 1 & thisspelllignite == 1 & (tentgelt < max_tentgelt) & almp_part == 0 & dazuverdiener == 0 & mini_ue == 0 
				* If there are several lignite minijobs with the same tentgelt, randomly pick one
				bys persnr begepi tentgelt mini_emp thisspelllignite: g deldup=_n if mini_emp == 1 & thisspelllignite == 1
				drop if deldup>1 & deldup!=.
				drop deldup

				* Step 4: if there are several nonlignite spells, keep the one with highest tentgelt, and if they have the same tentgelt randomly pick one
				* (stacks with somelignite == 0 were not touched by step 1, so max_tentgelt from step 2 is their overall maximum)
				bysort persnr begepi: drop if (min_status == max_status) & min_status == 2 & parallel == 1 & somelignite == 0 & (tentgelt < max_tentgelt) & almp_part == 0 
				* bys instead of by: the data are sorted explicitly instead of relying on the sort order left by earlier commands
				bys persnr begepi tentgelt mini_emp thisspelllignite: g deldup=_n if mini_emp == 1 & thisspelllignite == 0 // Randomly pick one of the identical nonlignite spells
				drop if deldup>1 & deldup!=.
				drop deldup max_tentgelt
				
			* Check the results
			tab status if mini_emp == 1 // We would expect here to only have marginal employment spells, though some of which may be parallel.
			*cap duplicates report persnr begepi if mini_emp == 1

		*********************************************************************
		* (8.c.viii) Treatment of Azubis with parallel spells
		*********************************************************************
		
			/* Prioritise vocational spells, as it is most likely the relevant one for the individual. It is possible that an individual
			is registered as parallely pursuing vocational education and a marginal or normal employment (register entry may depend on employer). */
		
				* Rebuild the stack-level helpers on the current data: highest/lowest status and the parallel flag
				drop min_status max_status
				egen max_status = max(status), by(persnr begepi)
				egen min_status = min(status), by(persnr begepi)
				sort persnr begepi
				drop parallel
				gen parallel = 0
				replace parallel = 1 if (persnr==persnr[_n+1] & begepi==begepi[_n+1]) | (persnr==persnr[_n-1] & begepi==begepi[_n-1])
			
			* Azubis with parallel spells: parallel stacks whose highest status is vocational (4)
			* and that also contain at least one spell with a lower status
			gen parallel_azubi = 0
			bysort persnr begepi: replace parallel_azubi = 1 if parallel == 1 & (max_status==4) & (min_status < max_status)
			
				* Check the results
				tab status if parallel_azubi == 1
				tab status if status == 4
				tab status if parallel_azubi == 0 & status == 4
				tab status if parallel_azubi == 1 & status == 4 // These tab commands help checking if parallel_azubi is mutually exclusive
				tab status if parallel_azubi == 1 & status < 4
				* Check: numbering the flagged spells within persnr begepi shows how large the stacks are
				bys persnr begepi: g deldup=_n if parallel_azubi == 1
				tab deldup
				drop deldup
				
				* Within parallel stacks topped by a vocational spell, drop every spell with a lower status
				bysort persnr begepi: drop if parallel == 1 & max_status == 4 & (status < max_status)
				
				* Several vocational spells left: keep the one with the highest tentgelt
				* (highest tentgelt taken per persnr and begepi)
				cap drop max_tentgelt
				bys persnr begepi: egen max_tentgelt = max(tentgelt)
				bysort persnr begepi: drop if parallel == 1 & (max_status == 4) & (min_status < 4) & (tentgelt < max_tentgelt)
				drop max_tentgelt
				* Several vocational spells with the same tentgelt: keep one of them at random
				bys persnr begepi tentgelt: g deldup=_n if parallel_azubi == 1
				tab deldup
				drop if deldup>1 & deldup!=.
				drop deldup
				
			* Check the results
			tab status if parallel_azubi == 1
			*Check: cap duplicates report persnr begepi if parallel_azubi == 1
			tab status if max_status == 4 & parallel_azubi == 0
			tab status if max_status == 4 & parallel == 0
			tab status if max_status == 4 & parallel == 1 & parallel_azubi == 1
			tab status if max_status == 4 
		
		*********************************************************************
		* (8.c.ix) Treatment of Aufstocker (unemployed + normal employment)
		*********************************************************************
		
				* Aufstocker are assigned the status of a normal employment. We save the income information from parallel spells (e.g. the UE benefit or 
				* even an additional marginal employment) in several seperate variables.

				* Redefine highest/lowest status & parallel
				drop min_status max_status
				egen max_status = max(status), by(persnr begepi)
				egen min_status = min(status), by(persnr begepi)
				sort persnr begepi
				drop parallel
				gen parallel = 0
				replace parallel = 1 if persnr==persnr[_n+1] & begepi==begepi[_n+1]
				replace parallel = 1 if persnr==persnr[_n-1] & begepi==begepi[_n-1]
			
			* Identify Aufstocker: lowest status is an unemployment status (-2/-1/0), highest status is
			* normal employment (3), no ALMP. aufstocker is 1 for these spells and MISSING otherwise
			* (it is never 0), so "not an Aufstocker" has to be tested as aufstocker != 1.
			bysort persnr begepi: gen aufstocker = 1 if (min_status == -2 | min_status == -1 | min_status == 0) & max_status == 3 & almp_part == 0 
				
				* Check the results
				tab status if aufstocker == 1 // There should be no almp benefit or measure here
				tab status if aufstocker == 1 & status == max_status
				tab status if aufstocker == 1 & status == min_status
				bys persnr begepi: g deldup=_n if aufstocker == 1
				tab deldup
				drop deldup
		
				* Drop all unemployment & non-employment spells of aufstocker (and their marginal employment, status 2)
				drop if (aufstocker==1) & (status == -2 | status == -1 | status == 0 | status == 2) 

				* For aufstocker with multiple jobs, prioritize (1) lignite; (2) highest tentgelt
				  *(1) Prioritize lignite spells
				  * NOTE(review): this presumes somelignite == 1 implies that a lignite spell is still present in the stack - confirm.
				drop if (min_status == -2 | min_status == -1 | min_status == 0) & max_status == 3 & almp_part == 0 & thisspelllignite == 0 & somelignite == 1
				
				 * (2) then tentgelt (maximum taken over the spells that are left at this point): 
				cap drop max_tentgelt
				bys persnr begepi: egen max_tentgelt = max(tentgelt)
				
				drop if (min_status == -2 | min_status == -1 | min_status == 0) & max_status == 3 & almp_part == 0 & (tentgelt < max_tentgelt)

				* (3) remaining: drop randomly 
				bys persnr begepi tentgelt: g deldup=_n if aufstocker == 1
				tab deldup
				drop if deldup>1 & deldup!=.
				drop deldup
				
			* Check the results
			* The non-Aufstocker checks use "aufstocker != 1"; the former "aufstocker == 0" could never
			* match because the variable only takes the values 1 and missing.
			tab status if aufstocker == 1
			* Check: cap duplicates report persnr begepi if aufstocker == 1
			tab status if max_status == 3 & aufstocker != 1
			tab status if max_status == 3 & parallel == 0
			tab status if max_status == 3 & parallel == 1 & aufstocker != 1
			tab status if max_status == 3
			
		*********************************************************************
		* (8.c.x) * Treatment of ALMP participants
		*********************************************************************

			/*  In status, ALMP shows up as "-3" (ALMP benefit) and/or "1" (ALMP measure). A person may have
			only the benefit or only the measure. An ALMP participant is therefore anyone who is enrolled in
			an ALMP measure, receives an ALMP benefit, or both (single & multiple), regardless of which other
			spells run in parallel. */

				* Rebuild the per-episode status range and the parallel flag from the current data
				drop min_status max_status parallel
				egen max_status = max(status), by(persnr begepi)
				egen min_status = min(status), by(persnr begepi)
				sort persnr begepi
				* parallel = 1 when a neighbouring row shares persnr and begepi
				gen parallel = (persnr == persnr[_n+1] & begepi == begepi[_n+1]) | (persnr == persnr[_n-1] & begepi == begepi[_n-1])

			* Flag ALMP participants: a parallel spell with status -3 or 1 marks the whole persnr-begepi group
			* (almp_part ends up 1 for flagged groups and missing for all others)
			drop almp_part
			gen almp_part_help = 1 if inlist(status, -3, 1) & parallel == 1
			egen almp_part = max(almp_part_help), by(persnr begepi)
			drop almp_part_help

				* Inspect the outcome
				tabulate status if almp_part == 1 // There should be no vocational education here
				tabulate max_status if almp_part == 1 & min_status == -3
				tabulate min_status if almp_part == 1 & max_status == 1
				tabulate status if almp_part == 1 & status == max_status
				tabulate status if almp_part == 1 & status == min_status
				* Distribution of the number of parallel spells per flagged group
				bysort persnr begepi: generate deldup = _n if almp_part == 1
				tabulate deldup
				drop deldup
																
			* Remove parallel spells of ALMP participants

				/* Rules applied below, in this order:

					(1st rule) ALMP benefit (-3) paralleled only by unemployment or NALO spells (-4, -2, -1, 0):
						the episode becomes an ALMP spell (1) and the unemployment/NALO spells are deleted.
						Income from the ALMP benefit is suminc_A.
						Combinations: -3 & (-2 | -1 | 0)

					(2nd rule) ALMP measure (1) is the highest status: all lower parallel spells
						(ALMP benefit, unemployment, unemployment benefit, NALO) are deleted.
						Income from the ALMP benefit is suminc_A.
						Combinations: (-3 | (-2 | -1) | 0) & 1

					(3rd rule) Highest parallel status is 2 (ALMP & marginal employment):
						classify as ALMP and drop the parallel spells.
						Combinations: (-3 | 1) & ( none | -2 | -1 | 0) & 2

					(4th rule) ALMP participant is also normally employed (highest status 3):
						classify as normally employed. Income from normal employment is regular income,
						income from the ALMP benefit is suminc_A.
						Combinations: (-3 | 1) & ( none | -2 | -1 | 0) & 3

					(5th rule) Delete multiple parallel ALMP spells. */

				* --- 1st rule -------------------------------------------------------------
				* max_status is deliberately not recomputed between the two statements: both use the same group test
				drop               if almp_part == 1 & inlist(max_status, 0, -1, -2, -3) & inlist(status, -4, -2, -1, 0)
				replace status = 1 if almp_part == 1 & inlist(max_status, 0, -1, -2, -3) & status == -3

				* --- 2nd rule -------------------------------------------------------------
					* Status range after rule 1
					drop min_status max_status
					egen max_status = max(status), by(persnr begepi)
					egen min_status = min(status), by(persnr begepi)
					drop if almp_part == 1 & max_status == 1 & inlist(status, -4, -3, -2, -1, 0)

				* --- 3rd rule -------------------------------------------------------------
						* Status range and parallel flag after rule 2
						drop min_status max_status parallel
						egen max_status = max(status), by(persnr begepi)
						egen min_status = min(status), by(persnr begepi)
						sort persnr begepi
						gen parallel = (persnr == persnr[_n+1] & begepi == begepi[_n+1]) | (persnr == persnr[_n-1] & begepi == begepi[_n-1])

					* ALMP participants whose highest status is margemp: drop the non-ALMP spells, turn the benefit into an ALMP spell
						drop               if almp_part == 1 & max_status == 2 & inlist(status, -4, -2, -1, 0, 2)
						replace status = 1 if almp_part == 1 & max_status == 2 & status == -3

				* --- 4th rule -------------------------------------------------------------
						drop if almp_part == 1 & max_status == 3 & inlist(status, -4, -3, -2, -1, 0, 1, 2)

				* Participants who only hold ALMP benefit spells (status == -3) are classified as ALMP participants (status 1)
					* Status range and parallel flag after rules 3 and 4
					drop min_status max_status parallel
					egen max_status = max(status), by(persnr begepi)
					egen min_status = min(status), by(persnr begepi)
					sort persnr begepi
					gen parallel = (persnr == persnr[_n+1] & begepi == begepi[_n+1]) | (persnr == persnr[_n-1] & begepi == begepi[_n-1])

					bysort persnr begepi: replace status = 1 if almp_part == 1 & status == -3

					* Inspect the outcome
					tabulate status if almp_part == 1

				* --- 5th rule: multiple parallel ALMP measure spells ------------------------

					* Status range after the recoding above
					drop min_status max_status
					egen max_status = max(status), by(persnr begepi)
					egen min_status = min(status), by(persnr begepi)

					* Highest tentgelt per persnr-begepi
					capture drop max_tentgelt_help
					egen max_tentgelt_help = max(tentgelt), by(persnr begepi)

				* Of the parallel ALMP spells keep the one(s) with the highest tentgelt
				* (parallel still refers to the flag computed before the recoding above)
				bysort persnr begepi: drop if status == 1 & parallel == 1 & tentgelt != max_tentgelt_help

				* Equal tentgelt: keep one spell, in arbitrary order
				bysort persnr begepi tentgelt: generate deldup = _n if min_status == 1 & max_status == 1 & parallel == 1
				tabulate deldup
				drop if deldup > 1 & !missing(deldup)
				drop deldup
										
		*********************************************************************
		* (8.c.xi) * Treatment of normalemp & margemp (no unemp)
		*********************************************************************
	
			* Redefine highest/lowest status & parallel
			drop min_status max_status parallel
			egen max_status = max(status), by(persnr begepi)
			egen min_status = min(status), by(persnr begepi)
			sort persnr begepi
			gen parallel = 0
			replace parallel = 1 if persnr==persnr[_n+1] & begepi==begepi[_n+1]
			replace parallel = 1 if persnr==persnr[_n-1] & begepi==begepi[_n-1]
			
		* Identify ppl with parallel normal emp & minijobs
		* (group-level flag: lowest status in the episode is 2, highest is 3)
			gen normemp_margemp = 0
			bysort persnr begepi: replace normemp_margemp = 1 if min_status == 2 & max_status == 3

		* record how many parallel spells affected
			bys persnr begepi: g deldup=_n if normemp_margemp == 1
			tab deldup
			* Spell the name out in full: the abbreviation "del" fails when another variable
			* starting with "del" exists (e.g. delme) or when varabbrev is switched off
			drop deldup 
			
			* Clear normemp_margemp: drop the minijob spell (status 2), keep the normal employment spell
			bysort persnr begepi: drop if (min_status == 2 & max_status == 3) & status == 2
		
			* only normalemp left now, although some of them may be parallel
			* tab status if normemp_margemp 
	
		****************************************************************************************************************************
		* (8.c.xii) * Treatment of parallel unemps (parallel combination of only unemp benefits (ALH/ALGI/ALGII) & unemp status (0)
		****************************************************************************************************************************

		* A parallel mix of unemployment status and unemployment benefits collapses to the unemployment spell.
		* Where several unemployment spells remain for whatever reason, one of them is kept in arbitrary order.

			* Rebuild the per-episode status range and the parallel flag
			drop min_status max_status parallel
			egen max_status = max(status), by(persnr begepi)
			egen min_status = min(status), by(persnr begepi)
			sort persnr begepi
			gen parallel = (persnr == persnr[_n+1] & begepi == begepi[_n+1]) | (persnr == persnr[_n-1] & begepi == begepi[_n-1])

		* Flag episodes made up solely of -2, -1 and 0 spells that run in parallel
		gen unemp_only = inlist(min_status, -2, -1, 0) & inlist(max_status, -2, -1, 0) & parallel == 1

			* Step 1 - where an unemployment spell (0) exists, drop the benefit spells next to it
				drop if unemp_only == 1 & parallel == 1 & max_status == 0 & status != 0

			* Step 2 - among the remaining multiple spells keep the highest benefit
				* Rebuild the status range and the parallel flag after step 1
				drop min_status max_status parallel
				egen max_status = max(status), by(persnr begepi)
				egen min_status = min(status), by(persnr begepi)
				sort persnr begepi
				gen parallel = (persnr == persnr[_n+1] & begepi == begepi[_n+1]) | (persnr == persnr[_n-1] & begepi == begepi[_n-1])

				* Highest tentgelt per persnr-begepi
				drop max_tentgelt_help
				egen max_tentgelt_help = max(tentgelt), by(persnr begepi)

				* Drop every benefit spell below that maximum (spells with missing tentgelt are left alone here)
				capture drop delme
				drop if unemp_only == 1 & parallel == 1 & !missing(tentgelt) & tentgelt < max_tentgelt_help
				drop max_tentgelt_help

				* Everything flagged is now recorded as "unemployed"
				replace status = 0 if unemp_only == 1

				* Any spells still sharing persnr-begepi (equal or missing tentgelt): keep one, arbitrary order
				bysort persnr begepi: generate deldup = _n if unemp_only == 1
				tabulate deldup
				drop if deldup > 1 & !missing(deldup)
				drop deldup

			* Inspect the outcome: we expect the same number of unemployment observations as above
			tabulate status if unemp_only == 1
	
	**************************************************************
	* (8.c.xiii) Treatment of parallel spells of the same status
	**************************************************************
	
		/* At this point in the code, all remaining parallel spells are of same status (!)
		
			(1) Vocational, normalemp and margemp:
				(1a)* If there is a single lignite spell, delete all other parallels
				(1b)* If there are multiple lignite spells or no lignite spells, delete all except highest tentgelt
			
			ALMP - these have been removed above 
			NALOS - these will be deleted further on			*/
	
			* Redefine parallel
			
			sort persnr begepi
			cap drop parallel
			gen parallel = 0
			replace parallel = 1 if persnr==persnr[_n+1] & begepi==begepi[_n+1]
			replace parallel = 1 if persnr==persnr[_n-1] & begepi==begepi[_n-1]
			
			* DESCRIPTIVES on parallel spells of same status
			* parallel_count1 numbers the spells inside a persnr-begepi-status group,
			* max_parallel_count1 is the group size, parallel_check1 marks the 2nd, 3rd, ... spell only
			bysort persnr begepi status: g parallel_count1 = _n
			bysort persnr begepi status: egen max_parallel_count1 = max(parallel_count1)
			g parallel_check1 = (parallel_count1 > 1)
			tab status if parallel_check1 == 1
			
			* DESCRIPTIVES on parallel spells of same status & same tentgelt
			bysort persnr begepi status tentgelt: g parallel_count2 = _n
			bysort persnr begepi status: egen max_parallel_count2 = max(parallel_count2)
			g parallel_check2 = (parallel_count2 > 1)
			tab status if parallel_check2 == 1
			tab parallel_check1 parallel_check2
				
		*(1a) Drop non-lignite spells paralleled by lignite spells (all of same status)
			* The test has to be made at group level. parallel_check1 is 0 for whichever spell happens to be
			* sorted first, so filtering on it let a non-lignite spell in first position survive and compete
			* with the lignite spell in (1b). We therefore use the group size and, as a safeguard against
			* emptying a group, require that a lignite spell is actually present in the same group.
			cap drop lig_in_group
			bysort persnr begepi status: egen lig_in_group = max(thisspelllignite == 1)
			drop if thisspelllignite == 0 & somelignite == 1 & max_parallel_count1 > 1 & lig_in_group == 1
			drop lig_in_group
			
		*(1b) If there are multiple lignite spells, delete all except highest tentgelt
			* Redefine parallel_check (1 for every spell in a persnr-begepi-status group with more than one spell)
			drop parallel_check*
			gen parallel_check = 0
			bysort persnr begepi status: replace parallel_check = 1 if (status == status[_n+1] | status == status[_n-1])
			* Sort by tentgelt by persnr and begepi
			* (descending, so the highest tentgelt comes first in each group)
			
			gsort persnr begepi status -tentgelt
			by persnr begepi status: g deldup=_n
			drop if deldup>1 & !mi(deldup)
		
			/* This step drops duplicates in persnr begepi status by tentgelt 
			(and randomly drops obs with same tentgelt) */

		****************************************
		*  Drop few remaining parallel spells  *
		****************************************
					
			* Check the results
			* duplicates report persnr begepi
			* duplicates report persnr begepi status
			* duplicates report persnr begepi status tentgelt 
			* Ideally, the three reports should all be identical. 
			* That is, all existing parallel spells have the same status and the same tentgelt and are, for our purposes, duplicates.
		
			/* Comment:
			A last part of preparation is concerned with parallel spells that were not treated so far. */

			* Redefine highest/lowest status & parallel
			drop min_status max_status parallel
			egen max_status = max(status), by(persnr begepi)
			egen min_status = min(status), by(persnr begepi)
			sort persnr begepi
			gen parallel = 0
			replace parallel = 1 if persnr==persnr[_n+1] & begepi==begepi[_n+1]
			replace parallel = 1 if persnr==persnr[_n-1] & begepi==begepi[_n-1]

		*cap duplicates report persnr begepi // We would expect here to have no more duplicates
		* duplicates tag stores the number of surplus copies (0 = unique, 1 = one duplicate, 2 = two, ...),
		* so "> 0" is needed to include groups of three or more spells in the diagnostics
		duplicates tag persnr begepi, gen(rest_parallel)
		tab status if rest_parallel > 0
		tab min_status max_status if rest_parallel > 0
		
		* Keep one spell per persnr-begepi (arbitrary order within the group)
		cap drop deldup
		bys persnr begepi: g deldup=_n
		tab deldup
		drop if deldup>1 & deldup!=.
		drop deldup
	
	* Reallocate the status of non-parallel benefit or ALMP spells
	replace status = 0 if status == -2 | status == -1 // Reallocate unemployment status to unemployment benefit spells
	replace status = 1 if status == -3 // Reallocate ALMP participation status to ALMP benefit spells
	
	* Drop non-parallel NALO and misc spells
		
	/* Comment:
	As we have no information on how to correctly deal with those spells, we drop them leaving black holes that may or may not be prepared below. */

	drop if (status == -4 | status == -99)

	* Final check: after the steps above no parallel spells should be left
	drop parallel 
	sort persnr begepi
	gen parallel = 0
	replace parallel = 1 if persnr==persnr[_n+1] & begepi==begepi[_n+1]
	replace parallel = 1 if persnr==persnr[_n-1] & begepi==begepi[_n-1]
	tab parallel
	
	************************************************
	* (8.d) Including income from social benefits   *
	************************************************

		/* Note: tentgelt only carries information from LeH, not from MTH & LHG, so income from
		social benefits such as ALG II or the housing benefit has to be imputed.
		The two imputations differ:
		  - housing benefit: the average per-capita benefit per state and year is added;
		    one rate applies to a whole calendar year (January 1 to December 31);
		  - ALG II: the official rates are added from their respective effective dates;
		    in the years 2006 to 2009 new rates became effective on July 1.
		The procedure relies on predefined globals holding the rates by year and state.
		The globals are defined in "00_master.do".
		For further details on the calculation of these rates, please see the datadoc. */

		* Unemployment benefit (ALG II)
			/* Note: From January 1, 2005, until June 30, 2007, benefit rates differed between the
			old (western) and new (eastern) German states. Two dummies separate the two groups of
			states so that the right rate can be imputed. */

		* Federal state of residence (NUTS-1): ieb_wo_gem_num divided by 1,000,000, truncated to an integer
		capture drop wo_bula
		generate wo_bula = int(ieb_wo_gem_num/1000000)
		label variable wo_bula "Wohnort Bundesland"

		* West dummy: 1 for state codes 1-10, 0 for codes 11-16, missing otherwise
		generate west = inrange(wo_bula, 1, 10) if inrange(wo_bula, 1, 16)

		* East dummy: 1 for state codes 11-16, 0 for codes 1-10.
		* Delme: Variable east has already been defined in iab==1 (?) - hence "capture": an existing east is kept
		capture generate east = 1 if inrange(wo_bula, 11, 16)
		* The zeros are needed as well - they are used in ieb_wages_prepare
		replace east = 0 if inrange(wo_bula, 1, 10)
		
				* Days of each spell/episode falling into each calendar year 2001-2017
					/* Note: A spell or episode may stretch over two or more consecutive years, during which a household
					receives different benefit rates. The rates are therefore weighted (and later summed) by the share of
					days each year contributes. This only works for years in which a rate is effective from January 1 to
					December 31, so some ALG II years need separate treatment. */

					forvalues i = 2001/2017 {
						local jan1  = mdy(1,1,`i')
						local dec31 = mdy(12,31,`i')

						* Case 1: spell begins and ends within year i
						gen days_`i' = cond(year(begepi) == `i' & year(endepi) == `i', endepi - begepi + 1, 0)

						* Case 2: spell ends in a later year than it begins
						* - year i is the first year of the spell: begepi up to December 31
						replace days_`i' = `dec31' - begepi + 1 if year(begepi) == `i' & year(endepi) > year(begepi) & `dec31' - begepi >= 0
						* - year i lies fully inside the spell: full length of the year (365, or 366 in leap years)
						replace days_`i' = `dec31' - `jan1' + 1 if year(begepi) < `i' & year(endepi) > `i'
						* - year i is the last year of the spell: January 1 up to endepi
						replace days_`i' = endepi - `jan1' + 1 if year(endepi) == `i' & year(endepi) > year(begepi) & endepi - `jan1' >= 0
						}

					* Total number of days counted over 2001-2017, and the same in months
					gen days_total = 0
					forvalues i = 2001/2017 {
						replace days_total = days_total + days_`i'
						}
					gen months_total = days_total/30.4375

			gen unempben = 0

			/* Note: ALG II rates for 2005 to 2010 do not follow calendar years. Effective windows:
						rate 2005: January 1, 2005 - June 30, 2006      (18 months)
						rate 2006: July 1, 2006    - June 30, 2007
						rate 2007: July 1, 2007    - June 30, 2008
						rate 2008: July 1, 2008    - June 30, 2009
						rate 2009: July 1, 2009    - December 31, 2010  (18 months)
						from 2011: January 1 - December 31 (handled by the calendar-year loop further down)

			For each rate, the spell contributes the days it overlaps with the rate's window:
						overlap = min(endepi, window end) - max(begepi, window start) + 1
			for spells that touch the window at all. This single expression covers the four cases
			"both dates inside", "only begepi inside", "only endepi inside" and "window fully inside the spell".
			min() and max() ignore missing arguments, so an open (missing) endepi counts up to the window end.

			The rate globals are region specific (unempben_<year>_east / unempben_<year>_west). Each rate is
			applied only to spells whose region dummy equals 1. Without that restriction every unemployed
			spell would receive the eastern AND the western rate. */

			forvalues y = 2005/2009 {
				* First and last day on which the rate introduced in year y is effective
				local lo = cond(`y' == 2005, mdy(1,1,2005), mdy(7,1,`y'))
				local hi = cond(`y' == 2009, mdy(12,31,2010), mdy(6,30,`y'+1))
				foreach region in east west {
					replace unempben = unempben + (${unempben_`y'_`region'}/30.4375)*(min(endepi, `hi') - max(begepi, `lo') + 1) if status == 0 & `region' == 1 & begepi <= `hi' & endepi >= `lo'
					}
				}
			
				/* Note: For the remaining years the rates follow calendar years, so the day counts days_<year> can be used. */
			* Years 2011-2017
			forvalues i = 2011/2017 {
				foreach region in east west {
					replace unempben = unempben + (${unempben_`i'_`region'}/30.4375)*days_`i' if year(begepi) <= `i' & year(endepi) >= `i' & `region' == 1 & status == 0
					}
				}
			* Convert the accumulated amount into an average monthly amount.
			* Spells without any day in 2001-2017 have days_total == 0; they received no benefit above and
			* must stay at 0 (dividing by zero would turn them into missing and, later on, suminc_all as well).
			replace unempben = unempben * (30.4375/days_total) if days_total > 0
		
		* Housing benefit
		* For every year 2001-2016 and every federal state 1-16, add the state/year rate (global housben_<year>_<state>),
		* weighted by the number of days the unemployment spell (status 0) spends in that year.
		* NOTE(review): the loop stops at 2016 while days_2017 exists - presumably no 2017 rate is available; confirm.
		gen houseben = 0
			forvalues i = 2001/2016 {
				forvalues n = 1/16 {
					replace houseben = houseben + ((${housben_`i'_`n'}/30.4375)*days_`i') if year(begepi) <= `i' & year(endepi) >= `i' & wo_bula == `n' & status == 0
					}
				}
				
	* Convert the accumulated amount into an average monthly amount.
	* Spells without any day in 2001-2017 have days_total == 0 and keep houseben = 0
	* (dividing by zero would turn them into missing and, later on, suminc_all as well).
	replace houseben = houseben * (30.4375/days_total) if days_total > 0
	drop days*
	 

	****************************************************************
	* (8.e) deflating incomes & Imputing above assessment ceiling   *
	****************************************************************

* The program ieb_wages_prepare contains the consumer price index (variable cpi) used to deflate wages further in 1prepare
* We use the consumer price indices published by the OECD. The reference year is 2010.

	if ${iab}==1 {
	* Forward slash so that the path also resolves outside Windows
	noisily do prog/ieb_wages_prepare.do
	
	* Amount added by the imputation for censored spells: imputed wage minus deflated observed wage.
	* This has to be computed BEFORE tentgelt is overwritten with wage_imp, otherwise the
	* difference is zero by construction.
	* NOTE(review): assumes wage_imp and tentgeltDefl (both created in ieb_wages_prepare) are on the same
	* scale, as implied by both being written into tentgelt below - confirm.
	g tentgeltadded=0
	replace tentgeltadded=(wage_imp-tentgeltDefl) if cens==1
	
	* get from this new wage measure 
	* including deflated & imputed wages.
	replace tentgelt = tentgeltDefl if cens==0
	replace tentgelt = wage_imp     if cens==1
	label var tentgelt "new tentgelt, including imputed values, see 1prepare/ieb_wages_prepare"
	}
	
	if ${iab}==0 {
	* Test data: cpi is set here by year of spell begin (jahrbeg), 1975-1991
	gen cpi=.
	label variable cpi "Consumer Price Index"
	replace cpi = 54.5 if jahrbeg == 1975
	replace cpi = 56.8 if jahrbeg == 1976
	replace cpi = 58.9 if jahrbeg == 1977
	replace cpi = 60.5 if jahrbeg == 1978
	replace cpi = 63.0 if jahrbeg == 1979
	replace cpi = 66.4 if jahrbeg == 1980
	replace cpi = 70.6 if jahrbeg == 1981
	replace cpi = 74.3 if jahrbeg == 1982
	replace cpi = 76.7 if jahrbeg == 1983
	replace cpi = 78.6 if jahrbeg == 1984
	replace cpi = 80.2 if jahrbeg == 1985
	replace cpi = 80.1 if jahrbeg == 1986
	replace cpi = 80.3 if jahrbeg == 1987
	replace cpi = 81.3 if jahrbeg == 1988
	replace cpi = 83.6 if jahrbeg == 1989
	replace cpi = 85.8 if jahrbeg == 1990
	replace cpi = 89.0 if jahrbeg == 1991
	* No imputation without the IAB data: nothing is added on top of the observed wage
	g tentgeltadded=0
	* without the IAB-data here no additional (imputed) variables 
	g tentgeltDefl=tentgelt*100/cpi
	replace tentgelt=tentgeltDefl
	g wage_preimp_permonth=tentgeltDefl*tentgeltdays
	g cens=0 
	* assume we observe all earnings fully in the test data
	}

/* ------------------------------------------------------------------------ */
/* (9) Collecting information on whole labour market biographies            */
/*     		(pre dropping of Black holes & series)                          */
/* 	   (9.a) income variable from all sources & wage growth (JAERE revision) */
/*     (9.b) specific lignite sector experience 							*/
/*     (9.c) Capture date of potential retirement (endepi of last spell)     */
/*     (9.d) Capture date of potential early retirement for ATZ cases	    */
/* ------------------------------------------------------------------------ */

/* ------------------------------------------------------------------------ */
	* (9.a.i) income variable encompassing income information from all sources
/* ------------------------------------------------------------------------ */
	* Note: tentgelt has been redefined above and now includes imputed values.
	* The remaining income components are deflated here with the same index.
	* cpi is defined in ieb_wages_prepare (if $iab==1)

	* Benefits and the income components A, B, D, M: plain deflation
	foreach inc of varlist unempben houseben suminc_A suminc_B suminc_D suminc_M {
		replace `inc' = 100 * `inc'/cpi
		}

	* suminc_Z and suminc_R: for censored spells "tentgeltadded", the extra amount from the
	* imputation algorithm, is added after deflation (it is deflated itself).
	* The other sumincs are benefits & minijob earnings and should not exceed the assessment ceiling.
	foreach inc of varlist suminc_Z suminc_R {
		replace `inc' = 100 * `inc'/cpi if cens==0
		replace `inc' = 100 * `inc'/cpi + tentgeltadded if cens==1
		}

	* Income from all possible sources combined
	gen suminc_all = suminc_A + suminc_B + suminc_D + suminc_M + suminc_R + suminc_Z + unempben + houseben
	drop cpi

	* Time consistency of the income information: according to the documentation the meaning of tentgelt
	* changes pre-/post-1999 for data from certain sources (switch from weekdays to calendar days)
	graph twoway mband suminc_all jahr, xline(1998)
	graph twoway mband tentgelt jahr, xline(1998)
/* ------------------------------------------------------------------------ */
/*     (9.a.ii) 	* Wage growth over years (pre-collating spells) */
/* ------------------------------------------------------------------------ */
* EMP E2 for JAERE revision
* Year-to-year relative change in tentgelt between two consecutive rows of the same person,
* computed only when both spells are lignite spells, the end years are consecutive
* and both wages are positive.
g jahrend=year(endepi)
sort persnr begepi
g tentgrowlig=.
replace tentgrowlig=(tentgelt - tentgelt[_n-1])/tentgelt[_n-1] if persnr==persnr[_n-1] & jahrend==(jahrend[_n-1]+1) & (thisspelllignite==1 & thisspelllignite[_n-1]==1) & (tentgelt>0 & tentgelt[_n-1]>0)
* Growth above 300% is discarded
replace tentgrowlig=. if tentgrowlig>3
label var tentgrowlig "percentage growth in wage year-to-year if in lignite"
drop jahrend
replace ageend = year(endepi) - year(geb_dat)
* (will be redefined in 3descriptives, but after spells collated)

* Person-level mean growth by age band. The bands are contiguous ([18,30), [30,40), [40,50), [50,60), 60+)
* to match the variable labels. The former upper bounds 39/49/59 left ages 39, 49 and 59 in no band.
* Each mean is filled only on the spells that fall inside the respective band.
bys persnr: egen tentgrowlig_twen = mean(tentgrowlig) if (ageend>=18 & ageend<30)
bys persnr: egen tentgrowlig_thir = mean(tentgrowlig) if (ageend>=30 & ageend<40)
bys persnr: egen tentgrowlig_four = mean(tentgrowlig) if (ageend>=40 & ageend<50)
bys persnr: egen tentgrowlig_five = mean(tentgrowlig) if (ageend>=50 & ageend<60)
bys persnr: egen tentgrowlig_sixp = mean(tentgrowlig) if (ageend>=60 & ageend<.)
					
label var tentgrowlig_twen "wage growth in lignite if aged 18-30"
label var tentgrowlig_thir "wage growth in lignite if aged 30-40"
label var tentgrowlig_four "wage growth in lignite if aged 40-50"
label var tentgrowlig_five "wage growth in lignite if aged 50-60"
label var tentgrowlig_sixp "wage growth in lignite if aged 60plus"

					
/* ------------------------------------------------------------------------ */
/*     (9.b) 		* Specific lignite sector experience */
/* ------------------------------------------------------------------------ */

		* Experience is accumulated over lignite spells only
			* Spell length in days for lignite spells with status 3 or 4, zero for every other spell
			capture drop durlig
			generate durlig = cond(thisspelllignite == 1 & inlist(status, 3, 4), (endepi - begepi) + 1, 0)
			* Running total within person, in order of begepi
			by persnr (begepi): generate ligexp = sum(durlig)
			summarize durlig, detail

			* Experience category with a threshold of 1095.75 days (3 years):
			*   1 = "low experience", at most 3 years worked in the lignite industry
			*   2 = "high experience", more than 3 years worked in the lignite industry
			generate exp_cat = cond(ligexp <= 1095.75, 1, 2) if !missing(ligexp)
			tabulate exp_cat, missing
			
/* -------------------------------------------------------------------------- */
/*     (9.c) 	* Capture date of potential retirement (endepi of last spell) */
/* --------------------------------------------------------------------------- */	
	
		* Capture date of last non-minijob spell by person
		* (latest endepi over all spells with status != 2; filled on those spells only)
		bys persnr (begepi): egen potret_help=max(endepi) if status!=2
		* copy values to all observations
		bys persnr (begepi): egen potret=max(potret_help)
		format potret %tdDDmonYY

		* if last observed spell too close to end of observation window, not potential retirement
		* (dates after 1 September of maxyear are pushed to 1 January of the following year)
		replace potret=mdy(01,01,${maxyear}+1) if (potret > mdy(9,1,${maxyear})) & !missing(potret)
		label var potret "date of last obs of person in data unless 2017 (last wave): potential date of retirement"
		drop potret_help
		
		* Potential age of retirement
		cap drop agepotret
		gen agepotret = year(potret) - year(geb_dat)
		label var agepotret "potential age of retirement at date potret"
		
		di "tab age of potential retirement"
		tab agepotret if firstpers==1, missing
		di "tab age of potential retirement - direct retirement out of unemployment"
		tab agepotret  if thisspelllignite == 1 & (grund == 509 | grund == 1154)
		di "tab age of potential retirement - retirement from employment: end of job (grund=130)"
		tab agepotret  if thisspelllignite == 1 & status==3 & (grund==130)
		di "tab age of potential retirement - Lignite - unemp/margemp/ALMP - retirement"
		* The following spell must belong to the same person. Data are sorted by persnr begepi at this point,
		* so without this check a person's last spell would be compared with the first spell of the next person.
		tab agepotret  if thisspelllignite == 1 & status==3 & persnr==persnr[_n+1] & (status[_n+1]==0 | status[_n+1]==1 | status[_n+1]==2)
		
		
/* -------------------------------------------------------------------------- */
/*     (9.d) 	* Capture date of potential early retirement for ATZ cases */
/* --------------------------------------------------------------------------- */

		************************************************************************
		* (9.d.1) Begin, End and Duration of ATZ (based on erwstat)
		************************************************************************

		*** (9d.1a) Begin of ATZ ****

		capture drop first103
		capture drop Dfirst103

		generate Dfirst103 = 0
		* first103: earliest begepi among the person's erwstat 103 spells (filled on those spells only)
		bysort persnr (begepi): egen first103 = min(begepi) if erwstat == 103
		format first103 %td
		label var first103 "Begin (Date) of ATZ, begepi"
		* Dfirst103: dummy marking the spell that starts on that date
		replace Dfirst103 = 1 if !missing(begepi) & first103 == begepi
		label var Dfirst103 "dummy first ATZ-observation"

		* Age distribution at the begin of ATZ, one observation per person
		* (age taken at begepi)
		tabulate agebeg if Dfirst103 == 1

		* Age at begin of ATZ, first on the marked spell only ...
		capture drop agebeg103
		generate agebeg103 = agebeg if Dfirst103 == 1
		* ... then spread to all spells of the person: sorting on agebeg103 puts the non-missing value first
		bysort persnr (agebeg103): replace agebeg103 = agebeg103[1]
		label var agebeg103 "Age at begin of ATZ"
		
		*** (9.d.1b) End of ATZ (not always observed)****

		************************************************************************** 
		* Beware: end of the observation period
		* A last ATZ spell that ends too close to the end of the observation
		* window is not treated as an observed end of ATZ.
		*
		* (9d.1b.1) do not assume people retire at the end of maxyear if this is
		*           their last spell
		*
		* (9d.1b.2) do not simply remove the maxyear spell either: spells have not
		*           been collated yet, so this would impute many retirements
		*           for 2016
		*
		* (9d.1b.3) Route chosen here:
		*        (i)  recode the end of the last ATZ spell to 1 Jan of maxyear+1
		*             (i.e. a date in the future)
		*        (ii) treat such cases as right-censored: Dlast103 is set to
		*             missing and rightcens103 to 1
		************************************************************************** 
		foreach v in last103 Dlast103 rightcens103 {
			capture drop `v'
		}
		
		* censoring marker date and the cut-off from which an end counts as "too late"
		local atz_censdate = mdy(01,01,${maxyear}+1)
		local atz_cutoff   = mdy(9,1,${maxyear})
		
		* last103: latest endepi among the person's erwstat==103 spells
		* (only filled on the erwstat==103 rows themselves)
		bysort persnr (begepi): egen last103 = max(endepi) if erwstat == 103
		replace last103 = `atz_censdate' if !missing(last103) & last103 >= `atz_cutoff' & erwstat == 103
		label variable last103 "End (Date) of ATZ, endepi"
		format last103 %td
		
		* right-censoring flag: 1 where the end was pushed to the marker date, else missing
		generate rightcens103 = 1 if last103 == `atz_censdate'
		
		* Dlast103: 1 on the spell that ends on last103, 0 otherwise,
		* missing on right-censored rows
		generate Dlast103 = cond(last103 == `atz_censdate', ., (last103 == endepi) & endepi != .)
		label variable Dlast103 "dummy last ATZ-observation unless last date is post-Sept2017 (last wave)"
		
		* age distribution at the end of ATZ (ageend = age at endepi),
		* restricted to the last observed ATZ spell
		tabulate ageend if Dlast103 == 1
		
		* person103: 1 on every spell of a person with at least one erwstat==103 spell
		generate person103 = 1 if erwstat == 103
		* missing values sort last, so element [1] is 1 whenever the person has an ATZ spell
		bysort persnr (person103): replace person103 = person103[1]
		label variable person103 "person who will go into early retirement (erwstat103)"
		sort persnr begepi
		
		* number of ppl going into early retirement
		*tab person103 if firstpers==1
		tabulate Dfirst103
				
		*** (9.d.1c) Duration of ATZ ****

		cap drop dur103
		
		g dur103=.
		* (1) get duration for individuals who are not right-censored by the end of the observation period
		*     (set on the last ATZ spell only; spread to the other ATZ spells in step (3))
		replace dur103=(last103-first103) if Dlast103==1
		
		* (2) for people who are right-censored:  
		* Assume cohorts who go into early retirement 
		* remain in early retirement for the same duration
		* impute duration in ATZ by using non-censored individuals				

		* birth-decade categories used as regressors (code 2 is not assigned)
		g geb_dec=.
		replace geb_dec=1 if year(geb_dat)<1930 & year(geb_dat)!=.
		replace geb_dec=3 if year(geb_dat)>=1930 & year(geb_dat)<1940
		replace geb_dec=4 if year(geb_dat)>=1940 & year(geb_dat)<1950 
		replace geb_dec=5 if year(geb_dat)>=1950 & year(geb_dat)<1960
		replace geb_dec=6 if year(geb_dat)>=1960 & year(geb_dat)<1970		
		replace geb_dec=7 if year(geb_dat)>=1970 & year(geb_dat)<1980
		replace geb_dec=8 if year(geb_dat)>=1980 & year(geb_dat)<1990 
		replace geb_dec=9 if year(geb_dat)>=1990 & year(geb_dat)<2000		
		replace geb_dec=10 if year(geb_dat)>=2000 & year(geb_dat)!=.
		
		tab geb_dec, missing

		* (2.i) first get duration in ATZ from ATZ-people for whom we observe it
		* last103 only available if ppl not right-censored

		reg dur103 i.geb_dec thisspelllignite if Dlast103==1
		predict Pdur103 if (person103==1 & Dlast103!=1)
		label var Pdur103 "imputed duration in days of ATZ (erwstat=103)"
		* NB. we have four or five different predicted values by birth cohort
		* Use the full variable name: the abbreviation "Pdur" only works while
		* variable abbreviation is switched on and no other variable starts with "Pdur".
		g Ydur103=Pdur103/365
		su Ydur103, det
		drop Ydur103 
		
		g Plast103=.
		label var Plast103 "imputed end of ATZ period for right-censored ppl" 
		replace Plast103=first103+Pdur103 if (person103==1 & rightcens103==1)
		* Note: we may predict that we should have seen final observation of 
		* people in early retirement, ie. durations that are shorter than what we observe...
		* => then assume people are going to retire swiftly...
		* NOTE(review): runiform() is drawn here and no -set seed- is visible in this
		* section; confirm a seed is set upstream if Plast103 has to be reproducible.
		replace Plast103= mdy(1,1,${maxyear}+1)+ runiform()*365 if (Plast103<mdy(12,31,${maxyear})) & (rightcens103==1) & (person103==1)
		format Plast103 %td
		* we now have no more individuals for whom we predict an end that we do not see
		count if Plast103<mdy(12,31,${maxyear})
		
		* (3) copy values of duration to all spells of the person
		* NOTE(review): for right-censored people dur103 is taken from Pdur103, not from
		* the adjusted Plast103 - first103; confirm that this is intended.
		replace dur103=Pdur103 if person103==1 & rightcens103==1
		* missing values sort last, so element [1] is the person's smallest non-missing duration
		bysort persnr (dur103): replace dur103=dur103[1] if erwstat==103
		format dur103 %tg
		label var dur103 "duration in days of ATZ (early retirement)"
		* generate duration in years
		cap drop dur103y
		g dur103y=dur103/365
		label var dur103y "duration of early retirement (ATZ erwstat 103); excl 2017"
		bys Dlast103: su dur103y, det
		
		************************************************************************
		* (9.d.2) 	Retirement date for people in ATZ 
		*		Transition from active -> passive phase of early retirement
		************************************************************************
		* 	assume people move from active (working 100%) to passive (working 0%)
		*	ATZ at midpoint of 103 period => point of early retirement! 
		************************************************************************
		*generate midpoint between first begepi and last endepi of 103

		* mid103 = first103 + half of dur103; dur103 holds the observed duration for
		* non-censored people and the imputed duration for right-censored people

		g mid103=.
		replace mid103=first103 + round(dur103/2) if person103==1
		* copy the midpoint date to all spells of a person
		* (missing values sort last, so element [1] is the smallest non-missing date)
		bys persnr (mid103): replace mid103=mid103[1]
		format mid103 %td
		label var mid103 "midpoint (date) between first begepi and last endepi of ATZ period"
		sort persnr begepi
		
		di "tab grund for ATZ case"		
		tab grund if mid103!=.
		
		*Potential age of retirement for ATZ
		cap drop agepotret103
		gen agepotret103 = year(mid103) - year(geb_dat)
		label var agepotret103 "age of potential retirement for ATZ: age at midpoint between first begepi and last endepi of ATZ period"
		
		di "tab age of potential retirement for ATZ"
		tab agepotret103 if firstpers==1, missing
		tab agepotret agepotret103 if agepotret!=agepotret103 & firstpers==1

		di "tab age of potential retirement for ATZ - direct retirement out of unemployment"
		tab agepotret103  if thisspelllignite == 1 & (grund == 509 | grund == 1154)
		di "tab age of potential retirement for ATZ - retirement from employment: end of job (grund=130)"
		tab agepotret103  if thisspelllignite == 1 & status==3 & (grund==130)
		di "tab age of potential retirement for ATZ - Lignite - unemp/margemp/ALMP - retirement"
		* look-ahead to the next spell is restricted to the same person
		* (data are sorted by persnr begepi, see sort above)
		tab agepotret103  if thisspelllignite == 1 & status==3 & persnr==persnr[_n+1] & (status[_n+1]==0 | status[_n+1]==1 | status[_n+1]==2)
		
		/* 	the date of the midpoint is the actual early retirement date
			while people may appear still to be FT-employed, they have already
			retired after half of their time in early retirement programme ATZ 
			(erwstat=103)													  */
		
		/* 	adjust potret such that all the ppl that have worked in ATZ retire 
			at the midpoint of their ATZ phase and not at the endepi (= potret)
			of it.															  */
		
		replace potret=mid103 if mid103!=.	
		replace agepotret=agepotret103 if mid103!=.
		
		di "tab new age of potential retirement after taking into account ATZ"
		tab agepotret if firstpers==1, missing
		
		* generate potret dummy: 
		g dpotret =.
		* potential retirement is either at end of a spell (if no ATZ) or at mid103 (for ATZ cases)
		* including condition for case 103, to avoid more than one dummy dpotret per person:
		* for ATZ cases only the spell that contains mid103 is flagged
		*replace dpotret = 1 if ((endepi == potret) | (mid103 == potret)) & (!missing(potret))
		replace dpotret = 1 if ((endepi == potret) | (mid103 == potret & mid103>=begepi & mid103<=endepi)) & (!missing(potret))
		label var dpotret "potential retir't dummy = last non-minijob spell / mid-ATZ for ATZ cases"
		
		di "tab new age of potential retirement - direct retirement out of unemployment"
		tab agepotret  if thisspelllignite == 1 & (grund == 509 | grund == 1154) & (dpotret==1)
		di "tab new age of potential retirement - retirement from employment: end of job (grund=130)"
		tab agepotret  if thisspelllignite == 1 & status==3 & (grund==130) & (dpotret==1)
		di "tab new age of potential retirement - Lignite - unemp/margemp/ALMP - retirement"
		* look-ahead to the next spell (status and dpotret) is restricted to the same person
		tab agepotret  if thisspelllignite == 1 & status==3 & persnr==persnr[_n+1] & (status[_n+1]==0 | status[_n+1]==1 | status[_n+1]==2) & (dpotret[_n+1]==1)  

		** check before collating spells 
		* all potret = endepi cases should be included in dpotret =1 
		count if potret==.
		tab dpotret if endepi == potret & (!missing(potret))
		tab dpotret if mid103 == potret & mid103>=begepi & mid103<=endepi & (!missing(potret))
		tab thisspelllignite if (endepi == potret) & potret !=. & dpotret != 1
		tab thisspelllignite if (mid103 == potret) & potret !=. & dpotret != 1
		* How many people do we have who have potret without 130
		tab grund if dpotret==1

/* ------------------------------------------------------------------------ */
/* (10) Black holes                                                          */
/* ------------------------------------------------------------------------ */

	/* Comment: In order to get gapless employment biographies, we try to fill
	existing black holes using the following strategies. */
	/* 	Outline                                                                 */
	* (10.a) Fill if length of black hole is exactly one year and pre- and post-employment is identical
	* (10.b) Fill if length of black hole is at maximum 4 weeks
	* (10.c) Collate very short spells to previous spells ***
	
	/* ------------------------------------------------------------------------ */
	* (10.a) Fill if length of black hole is exactly one year and pre- and post-employment is identical
	/* ------------------------------------------------------------------------ */
	
	sort persnr begepi
	* Generating a dummy that equals one if the gap amounts to exactly one calendar year:
	* the spell begins on 1 January of year Y and the preceding row ends on
	* 31 December of year Y-2, so the whole calendar year Y-1 is missing.
	* Comparing dates directly handles 365- and 366-day years alike; no list of
	* leap years has to be maintained (the former list missed the gap over 2016).
	* Whether both rows belong to the same person is checked in equal_emp below.
	gen year_dummy = 0
	replace year_dummy = 1 if day(begepi) == 1 & month(begepi) == 1 & ///
		endepi[_n-1] == mdy(12, 31, year(begepi) - 2)
			
	* Generating a dummy that equals one if the employment before and after the gap is the same
	* (same person, establishment, occupation and erwstat; both rows with quelle == 1)
	gen equal_emp = 0
	replace equal_emp = 1 if persnr == persnr[_n-1] & betnr == betnr[_n-1] & quelle == 1 & ///
		quelle[_n-1] == 1 & beruf == beruf[_n-1] & erwstat == erwstat[_n-1]			
		
	* Generating new spells for previous black holes and appending them to the existing dataset

		* Save dataset and prepare for filling the black holes
		compress
		save ${data}\employee_outflow_1, replace
		* keep the spells that follow a fillable one-year gap; they are turned into filler spells
		keep if year_dummy == 1 & equal_emp == 1

		* Check the results 
		count
		disp(r(N))
		disp("Number of black holes if exactly one year filled by pre-/post-employment information")

		* Turn each kept spell (beginning 1 January of year Y) into a filler spell that
		* covers the missing calendar year Y-1. All other variables are those of the
		* spell after the gap.
		* endepi has to be set first, while begepi still holds 1 January of year Y.
			* Adjust endepi: 31 December of year Y-1
			replace endepi = begepi - 1
			
			* Adjust begepi: 1 January of year Y-1
			replace begepi = mdy(1, 1, year(begepi) - 1)

		* Save the filling helpfile	
		compress	
		save ${data}\employee_outflow_1_help, replace

	* Append filling helpfile to original dataset in order to fill black holes
	use ${data}\employee_outflow_1, clear
	append using ${data}\employee_outflow_1_help
		
		* Drop auxiliary variables	
		drop year_dummy equal_emp

	/* ------------------------------------------------------------------------ */
	* (10.b) Fill if length of black hole is at maximum 4 weeks (black_hole_gap_month=28 days)
	/* ------------------------------------------------------------------------ */
	
	* Black hole is filled with post-status (marginal, normal, vocational, unemp)...
	* i.e. the begin of the spell after the gap is moved back to the day after the
	* preceding spell ends, provided the preceding spell has status > 0
		sort persnr begepi
		* count real gaps only (at least one missing day) within the same person
		count if (begepi - endepi[_n-1] - 1) > 0 & (begepi - endepi[_n-1] - 1) < ${black_hole_gap_month} & status[_n-1]>0 & begepi!=. & persnr==persnr[_n-1]
		disp(r(N))
		disp("Number of black holes to be filled given shorter than criterion (black_hole_gap_month)")
		bys persnr (begepi): replace begepi = endepi[_n-1] + 1 if (begepi - endepi[_n-1] - 1) < ${black_hole_gap_month} & status[_n-1]>0 

	* ... unless pre-status is unemployment - then add black hole gap to unemployment
	* i.e. the end of the unemployment spell (status==0) before the gap is extended
	* to the day before the next spell begins
		* the count uses the same condition as the replace below (status of the spell
		* BEFORE the gap, i.e. of the current row), restricted to real gaps within a person
		count if (begepi[_n+1] - endepi - 1) > 0 & (begepi[_n+1] - endepi - 1) < ${black_hole_gap_month}	& status==0 & endepi!=. & persnr==persnr[_n+1]
		disp(r(N))
		disp("Number of black holes added to preceding unemployment spell given shorter than criterion (black_hole_gap_month)")
	bys persnr (begepi): replace endepi = (begepi[_n+1] - 1) if (begepi[_n+1] - endepi - 1) < ${black_hole_gap_month}	& status==0
	
sav ${data}/delme2, replace

/* ------------------------------------------------------------------------ */
/* (11) Collating consecutive spells                                         */
/* ------------------------------------------------------------------------ */
/* 	Outline: Consecutive spells are to be joined if                         */
	* (11.a) Identify consecutive spells of vocational training, ALMP and unemp spell (only according to equal status)
	* (11.b) Identify consecutive spells of normalemp, margemp (according to persnr, betnr, beruf, status)
	* (11.c) Collate consecutive spells of both types using a loop
/* ------------------------------------------------------------------------ */
	
		* Benchmarks before collating: one-day spells, total spell days and
		* aggregate income per "month" (to be compared with the same figures
		* computed after collating)
		count if endepi == begepi
		foreach v in dur total_dur {
			capture drop `v'
		}
		generate dur = endepi - begepi + 1
		egen total_dur = total(dur)
		summarize total_dur
		egen agg_income = total(suminc_all)
		generate total_dur_month = total_dur / tentgeltdays
		generate ave_agg_income = agg_income / total_dur_month
		summarize ave_agg_income
		drop agg_income ave_agg_income total_dur_month
		
		* Additional check: Check where retirement information gets lost
		* (grund codes 130/150 and 509/1154, overall and on each person's last spell)
		tabulate grund if inlist(grund, 130, 150, 509, 1154)
		by persnr: egen lastspell = max(spell)
		tabulate grund if inlist(grund, 130, 150) & spell == lastspell
		tabulate grund if inlist(grund, 509, 1154) & spell == lastspell
		drop lastspell
		
	sort persnr begepi	
	drop dur
	* spell length in days and in units of tentgeltdays
	generate dur_day = endepi - begepi + 1
	generate dur_month = (endepi - begepi + 1) / tentgeltdays
	* Averages over collated spells start out as the spell's own value; if a spell
	* is not part of a series of consecutive spells they stay that way
	generate average_tentgelt = tentgelt
	foreach s in A B D M R Z all {
		generate average_suminc_`s' = suminc_`s'
	}
		
	* (11.a) Identify consecutive spells of vocational training, ALMP & unemp spell (same status is sufficient)
		* Definition of consecut_u
		* Spells of the same person (persnr) that follow each other without a gap
		* and both have statsimple==0 are consecutive.
	
	*  (11.b) Identify consecutive spells of normalemp & margemp (according to persnr, betnr, beruf, status)
		*
		* Definition of consecut_e - criteria for linking spells
		* - same person ID (persnr)
		* - same company ID (betnr)
		* - same datasource (quelle)
		* - same labour market status (statsimple==1 & same erwstat - see NB. below)
		* NB. spells with different erwstat are not linked, since we want to be able
		*     to identify how long people are in early retirement.
		*
		* NB. Black holes are not bridged here: a spell only counts as consecutive if it
		*     begins the day after the previous one ends. All relevant black holes are
		*     expected to have been closed in the part above.

	generate consecut_u = 0
	generate consecut_e = 0
	
	* statsimple (black hole spells stay missing, see below):
	*   0 = unemp (status 0), ALMP (status 1) or margemp (status 2)
	*   1 = normal employment (status 3)
	*   2 = vocational (status 4)
	generate statsimple = cond(inlist(status, 0, 1, 2), 0, cond(status == 3, 1, cond(status == 4, 2, .)))
	
	* status is overwritten from the normalemp / vocational flags only AFTER
	* statsimple has been derived from it
	replace status = 3 if normalemp == 1
	replace status = 4 if vocational == 1

	
	*DelmeMB additional checking : is erwstat==103 always statsimple 1?
	disp("Check of statsimple if erwstat==103")
	tabulate statsimple if erwstat == 103

	label variable statsimple "0 - unemployed, margemp or ALMP / 1 - employed / 2 - vocational"
	
	* every spell of the respective type starts with a counter of 1
	replace consecut_u = (statsimple == 0)
	replace consecut_e = (statsimple == 1)
	
	* running position within a series of gapless statsimple==0 spells:
	* 1 for the first spell, 2 for the second, ...
	bysort persnr (begepi): replace consecut_u = consecut_u + consecut_u[_n-1] ///
		if statsimple == 0 & statsimple[_n-1] == statsimple ///
		& (begepi - endepi[_n-1]) == 1 ///
		& consecut_u == 1 & consecut_u[_n-1] != .
	
	* same for employment spells, which additionally have to agree in
	* betnr, quelle and erwstat
	bysort persnr (begepi): replace consecut_e = consecut_e + consecut_e[_n-1] ///
		if statsimple == 1 & statsimple[_n-1] == statsimple ///
		& (begepi - endepi[_n-1]) == 1 ///
		& erwstat == erwstat[_n-1] & quelle == quelle[_n-1] & betnr == betnr[_n-1] ///
		& consecut_e == 1 & consecut_e[_n-1] != .
	
	* combined counter; spells that are neither type (sum of 0) are set to missing
	generate consecut = cond(consecut_u + consecut_e == 0, ., consecut_u + consecut_e)
		
		* Check that consecut_u/e are mutually exclusive
		count if consecut_u > 0 & consecut_e > 0
		count if consecut > consecut_u & consecut_u != .
		count if consecut > consecut_e & consecut_e != .
		summarize consecut_u
		summarize consecut_e

		save ${data}/data1prep1, replace

* Keep the existing age variable as ageend2 and recompute ageend as the difference
* of calendar years, then compare the two definitions
rename ageend ageend2
g ageend=year(endepi) - year(geb_dat)
g diffages=ageend2-ageend
su diffages, detail

* erwstat of lignite spells ending after 2010, by age band at the end of the spell.
* Bands are half-open ([49,55), [55,60), 60+) so that the ages 49, 55 and 60 are
* not left out; missing ageend is excluded from the top band (a missing value
* compares as larger than any number).
tab erwstat if thisspelllignite==1 & year(endepi)>2010 & ageend<49 
tab erwstat if thisspelllignite==1 & year(endepi)>2010 & ageend>=49 & ageend<55 
tab erwstat if thisspelllignite==1 & year(endepi)>2010 & ageend>=55 & ageend<60 
tab erwstat if thisspelllignite==1 & year(endepi)>2010 & ageend>=60 & !missing(ageend)

disp("Note this is distribution of spells not individuals (spells will be combined in next step)")
* stib is only tabulated when the global iab equals 1
if ${iab}==1 {
tab stib if thisspelllignite==1 & year(endepi)>2010 & ageend<49 
tab stib if thisspelllignite==1 & year(endepi)>2010 & ageend>=49 & ageend<55 
tab stib if thisspelllignite==1 & year(endepi)>2010 & ageend>=55 & ageend<60 
tab stib if thisspelllignite==1 & year(endepi)>2010 & ageend>=60 & !missing(ageend)		
}
	
		**************************************
		* (11.c) Collate consecutive spells 
		**************************************
		* The largest value of consecut gives the length of the longest series of
		* consecutive spells. The loop below runs once per position 2, 3, ... and
		* each time merges the spell at that position into the first spell of its series.
		
		* Income at the begin of the (collated) spell: value of the first spell
		generate tentg_beg = tentgelt

		* also follow up on pre-imputed value of tentgelt
		generate tentbeg_preimp = wage_preimp_permonth

	summarize consecut
	local end = `r(max)'
	display `end'
	* marks the first spell of every series that has at least two members
	bysort persnr (begepi): generate first_consecut = 1 if consecut == 1 & consecut[_n+1] == 2

	* Income at the end of the (collated) spell: overwritten in the loop with the
	* value of the spell merged last
	generate tentg_end = tentgelt
	generate tentend_preimp = wage_preimp

	forvalues i = 2/`end' {
	
		* the row is the first spell of a series and the next row is the
		* series member at position `i'
		local absorb "consecut[_n+1] == `i' & first_consecut == 1"
	
		* Current length of every spell (the first spell of a series grows with
		* each pass, because its endepi is moved forward below)
		replace dur_day = endepi - begepi + 1
		replace dur_month = (endepi - begepi + 1) / tentgeltdays
		
		* Duration-weighted running averages: the average so far, weighted with the
		* length so far, combined with the value of the spell that is merged in
		bysort persnr (begepi): replace average_tentgelt = ///
			(average_tentgelt*dur_day + tentgelt[_n+1]*dur_day[_n+1]) / (dur_day + dur_day[_n+1]) if `absorb'
		foreach s in A B D M R Z all {
			bysort persnr (begepi): replace average_suminc_`s' = ///
				(average_suminc_`s'*dur_month + suminc_`s'[_n+1]*dur_month[_n+1]) / (dur_month + dur_month[_n+1]) if `absorb'
		}

		* Calculate wage growth 
		* take average change per year
		* report average wage growth per age category

		* (Earlier versions stored begin, end, duration and tentgelt of every merged
		*  spell in separate inc`i'_* variables; this is no longer done.)

		* Copy end-of-spell information of the merged spell to the first spell of the
		* series: reason of end (grund - important for identifying transitions to
		* retirement), endepi and the income at the end
		bysort persnr (begepi): replace grund = grund[_n+1] if `absorb'
		bysort persnr (begepi): replace endepi = endepi[_n+1] if `absorb'
		bysort persnr (begepi): replace tentg_end = tentgelt[_n+1] if `absorb'
		bysort persnr (begepi): replace tentend_preimp = wage_preimp[_n+1] if `absorb'

		* Remove the spell that has just been merged: it sits directly behind the
		* first spell of its series
		bysort persnr (begepi): drop if consecut == `i' & first_consecut[_n-1] == 1
		}

		************************************************************************
		** check after collating spells 
		* all potret = endepi cases should be included in dpotret =1 
		tab agepotret if firstpers==1, missing
		count if potret==.
		tab dpotret if endepi == potret & (!missing(potret))
		tab dpotret if mid103 == potret & (mid103>=begepi) & (mid103<=endepi) & (!missing(potret))
		tab thisspelllignite if (endepi == potret) & (potret !=.) & dpotret != 1
		tab thisspelllignite if (mid103 == potret) & (potret !=.) & dpotret != 1
		* How many people do we have who have potret without 130
		tab grund if dpotret==1
		************************************************************************

		* make sure dpotret includes endepi == potret cases
		* needs to be replaced after collating spells (endepi of collated spells has changed)
		*replace dpotret = 1 if (endepi == potret) & !missing(potret)
		*replace dpotret = 1 if (endepi == mid103) & (person103 == 1) & !missing(potret)
		replace dpotret = 1 if ((endepi == potret) | (mid103 == potret & mid103>=begepi & mid103<=endepi)) & (!missing(potret))

		* Recompute the age of potential retirement as the exact age (in completed
		* years, 49 to 80) in which potret falls; it stays missing outside that range
		cap drop agepotret
		g agepotret =.
		forvalues j = 49/80 {
		* (1) determine start and end of biological age j \in [ 49,...,80 ]
		di "date of begin and end being `j' years old"
			g bega`j' = mdy(month(geb_dat),day(geb_dat),year(geb_dat)+`j')
			* mdy() returns missing for a date that does not exist, i.e. for a
			* 29 February birthday in a non-leap year; use 1 March in that case
			replace bega`j' = mdy(3,1,year(geb_dat)+`j') if missing(bega`j') & month(geb_dat)==2 & day(geb_dat)==29
			format bega`j' %td
			g enda`j' = mdy(month(geb_dat),day(geb_dat),year(geb_dat)+`j'+1)-1
			* same fallback for the day before the next birthday
			replace enda`j' = mdy(3,1,year(geb_dat)+`j'+1)-1 if missing(enda`j') & month(geb_dat)==2 & day(geb_dat)==29
			format enda`j' %td
		* (2) dummy: does potential retirement in biological age j fall into a given spell
			g dagepotret`j' = 0
			replace dagepotret`j' = 1 if (bega`j' <= potret & enda`j' >= potret) ///
			& !missing(potret) & !missing(bega`j') & !missing(enda`j') ///
			& dpotret == 1
		* (3) biological age in which retirement falls
			replace agepotret = `j' if dagepotret`j' == 1
			cap drop dagepotret`j'
			} 
		* copy the age to all spells of the person. The age intervals do not overlap,
		* so a flagged row gets exactly one value in the loop and a single pass after
		* the loop is sufficient (missing values sort last, so element [1] is the
		* person's non-missing value, if any)
		bysort persnr (agepotret): replace agepotret = agepotret[1]
				
		* Check the results
		tab consecut // Only the value 1 should remain here: merged spells (2, 3, ...) were dropped, 0 was recoded to missing and tab omits missings
		* original vs. duration-weighted average income of the collated spells
		sum tentgelt if tentgelt>0, detail
		sum average_tentgelt if average_tentgelt>0, detail
		sum suminc_A if suminc_A>0, detail
		sum average_suminc_A if average_suminc_A>0, detail
		sum suminc_B if suminc_B>0, detail
		sum average_suminc_B if average_suminc_B>0, detail
		sum suminc_D if suminc_D>0, detail
		sum average_suminc_D  if average_suminc_D>0, detail
		sum suminc_M if suminc_M>0, detail
		sum average_suminc_M if average_suminc_M>0, detail
		sum suminc_R if suminc_R>0, detail
		sum average_suminc_R if average_suminc_R>0, detail
		sum suminc_Z if suminc_Z>0, detail
		sum average_suminc_Z if average_suminc_Z>0, detail
		sum suminc_all if suminc_all>0, detail
		sum average_suminc_all if average_suminc_all>0, detail
		
		drop consecut first_consecut
	
		* Postcheck
		count if endepi < begepi
		count if endepi == begepi

		* redefine duration & age *
		* (same benchmarks as before collating, now based on the collated spells)
		cap drop dur
		cap drop total_dur
		gen dur = (endepi - begepi) + 1
		egen total_dur = total(dur)
		sum total_dur
		egen agg_income = total(average_suminc_all)
		gen total_dur_month = total_dur/tentgeltdays
		gen ave_agg_income = agg_income / total_dur_month
		sum ave_agg_income
		drop agg_income ave_agg_income total_dur_month
		* age at end / begin of the collated spell (difference of calendar years)
		cap drop ageend
		cap drop agebeg
		g ageend= year(endepi)-year(geb_dat)
		g agebeg= year(begepi)-year(geb_dat)
		
		* broad age categories; ages below 18 and missing ages stay missing
		cap drop ageendcat
		gen ageendcat = 1 if ageend >= 18 & ageend <= 30
		replace ageendcat = 2 if ageend > 30 & ageend <= 50
		* a missing value compares as larger than any number, so it has to be
		* excluded explicitly - otherwise missing ages end up in "over 50"
		replace ageendcat = 3 if ageend > 50 & !missing(ageend)
		label var ageendcat "age at end of spell by broad category"
		label define ageendcatLAB 1 "18-30 years old" 2 "30-50 years old" 3 "over 50 years old", modify
		label values ageendcat ageendcatLAB
		tab ageendcat if firstpers==1, m
		
		* Additional check: Check where retirement information gets lost
		tab grund if (grund == 130 | grund == 150)
		tab grund if (grund == 509  | grund == 1154)
		by persnr: egen lastspell = max(spell)
		tab grund if (grund == 130 | grund == 150) & spell == lastspell
		tab grund if (grund == 509  | grund == 1154) & spell == lastspell
		drop lastspell

/* ------------------------------------------------------------------------ */
/* (12) Generate datasets for estimation                                    */
/* ------------------------------------------------------------------------ */
/* Outline
	* (12.a) Drop individuals with too long or too many black holes
	*		(All the following datasets build on dataset created in 11a.)
			-> postprep_1
	* (12.b) Sample that contains all remaining information for each individual,
	* 		i.e. black holes are an additional spell assigned the status 10 - 'black hole'.
			-> postprep_2
	* (12.c) Select only one series per person (no interruptions with long black holes):
	*		1) only lignite series, 2) longest series if multiple lignite series
			-> postprep_3
	* (12.d) Select only one series per person (no interruptions with long black holes): 
	*		1) only lignite series, 2) series with at least one transition (into or out of lignite)
			3) longest series if multiple lignite transition series
			-> postprep_4
	* ------------------------------------------------------------------------ */
	
	/* After the above data preparation is completed, we obtain a deparallelized dataset including all relevant socioeconomic and income information.
	But, since the part on black holes only prepares some of the gaps (between the end and the beginning of two consecutive spells)
	that may be included in some histories of employment due to several reasons (e.g. appointment as civil servant, maternity leave etc.),
	we need to finally prepare black holes/gaps to obtain a gapless and deparallelized dataset.
	
	We achieve a gapless dataset in two steps:
	
	- First, we drop individuals with a certain duration of black holes in their history of employment or with simply too much black holes.
	- Second, we split the remaining fragmented (i.e. interrupted by black holes) histories of employment into several subsegments at the spot of a black hole.
	
	More details on how these rules are implemented can be found below. */
		
	/* ------------------------------------------------------------------------ */
	 * (12.a) Drop individuals with too long or too many black holes
	/* ------------------------------------------------------------------------ */
		/* drop individuals ...
			(1) whose history of employment exhibits a share of black hole duration
				of more than 100%
			(2) whose history of employment contains more than 20 black holes */
	
	* --- Total black hole duration per person ---------------------------------
	* A black hole is the gap in days between the end of the previous spell and
	* the start of the current one. The first spell of a person has no
	* predecessor, so its gap is missing and does not enter the person total.
	capture drop black_hole_dur
		display "Calculate the duration of each single black hole (if there is one)"
	bysort persnr (begepi): generate black_hole_dur = begepi - endepi[_n-1] - 1
		display "Sum black hole duration by person"
	by persnr: egen black_hole_sum = total(black_hole_dur)
	drop black_hole_dur

	* --- Total length of each person's history of employment ------------------
	* Measured from the start of the first spell to the end of the last spell,
	* both days included.
	capture drop last_spell
		display "Redefine the spell counting variable"
	bysort persnr (begepi): replace spell = _n
		display "Capture the number of the last spell by person"
	bysort persnr: egen last_spell = max(spell)
		display "Capture the begin of each history of employment"
	bysort persnr: generate begin_history_help = begepi if spell == 1
	bysort persnr: egen begin_history = max(begin_history_help)
	drop begin_history_help
		display "Capture the end of each history of employment"
	by persnr: generate end_history_help = endepi if spell == last_spell
	by persnr: egen end_history = max(end_history_help)
	drop end_history_help
		display "Calculate the total length of each history of employment"
	generate history_sum = (end_history - begin_history) + 1

	* --- Share of the history of employment that is spent in black holes ------
	generate black_hole_share = black_hole_sum / history_sum
	drop last_spell begin_history end_history black_hole_sum history_sum
	
	* --- Number of black holes per person --------------------------------------
	* Every strictly positive gap between two consecutive spells counts as one
	* black hole, regardless of its length.
		display "Calculate the duration of each single black hole (if there is one)"
	bysort persnr (begepi): generate black_hole_dur = begepi - endepi[_n-1] - 1
		display "Calculate a dummy that equals one if there is a black hole (even if it is only one day!)"
	generate bhdummy = (black_hole_dur > 0 & !missing(black_hole_dur))
		display "Capture the number of black holes by person"
	by persnr: egen black_hole_number = total(bhdummy)
	drop bhdummy black_hole_dur
	
		* Inspect both black hole measures before anything is removed
			display "Descriptives of the share of black holes"
		summarize black_hole_share
			display "Plot histogram of the share of black holes"
		label variable black_hole_share "Share of black hole duration of the history of employment"
		histogram black_hole_share
			display "Capture the number of affected vs unaffected spells (from dropping individuals by the number of black holes)"
		tabulate black_hole_number
	
	* Remove whole individuals that exceed either threshold. Both measures are
	* constant within a person, so a person is either kept or dropped entirely.
	* The thresholds are the globals bhcritpct and bhcritnr, which are not set in
	* this part of the file (according to the original note: 00master / 0test).
	* Latest setting noted by the authors: 100% (removes no one) and 20 black
	* holes (removes less than 1%).
	* NOTE(review): black_hole_share is a fraction (duration / history length);
	* confirm that bhcritpct is expressed on the same scale.
	drop if black_hole_share > ${bhcritpct}
	drop if black_hole_number > ${bhcritnr}
	drop black_hole_share black_hole_number
	
	**************************************************************************
	* Save sample: all individuals except for those with too many black holes  
	**************************************************************************
	
	save ${data}/postprep_1.dta, replace
	
	/* ------------------------------------------------------------------------ */
	* (12b) Generate a sample that contains all remaining information for each individual,
	* 		i.e. black holes are an additional spell assigned the status 10 - 'black hole'. 
	* 
	* 		-> This will generate postprep_2.dta
	*		
	/* ------------------------------------------------------------------------ */
	
	* If we want to use several parts of employment history which are separated
	* by black holes, ONE option is to generate a gap-less data set that 
	* contains 'black hole' as a status if a black hole exists.
	
	* The following code results in a data set that contains 'one (gap-less) series' 
	* per individual (no spells dropped).   
	* Using this sample, all transitions of an individual are counted.
	
	* ------------------------------------------------------------------------ *
	*  Non-participation / Black Holes
	* ------------------------------------------------------------------------ *
	* Covering: 
	*		(i) people returning to the dataset
	*       (ii) spells not covered by cleaning of short spells (<4 weeks)
	*       (iii) spells not covered by cleaning of exactly-one-year-spells

		* Gap in days between the end of the previous spell and the start of
		* this one. Under the by-prefix the first spell of each person has no
		* predecessor, so its gap is missing.
		cap drop black_hole_dur
		gen black_hole_dur = .
		bys persnr (begepi): replace black_hole_dur = (begepi - endepi[_n-1]) - 1

		*	black_hole_dur lists number of days gone *prior* to begin of this spell
		*	- after the end of the preceding spell
		*  => need to insert a new spell starting the day after endepi of the spell preceding the black hole
		*  => add new non-participation spell in line where black_hole_dur>0
		
		* Flag spells that are directly preceded by a black hole
		cap drop bhadd
		g bhadd=.
		sort persnr begepi
		replace bhadd=1 if (black_hole_dur>0 & !mi(black_hole_dur) & persnr==persnr[_n-1])
		* Plain double quotes: the former LaTeX-style quoting (two left quotes,
		* two apostrophes) is macro-expansion syntax in Stata, not a string.
		label var bhadd "black hole spell is missing above (=before) this one"
		* First day of the black hole: the day after the preceding spell ended
		cap drop bhbegepi
		g bhbegepi=. 
		sort persnr begepi
		replace bhbegepi = endepi[_n-1]+1 if (black_hole_dur>0 & !mi(black_hole_dur) & (persnr==persnr[_n-1]))
		* Last day of the black hole: the day before this spell begins
		cap drop bhendepi
		g bhendepi=.
		replace bhendepi = begepi-1 if (black_hole_dur>0 & black_hole_dur!=.)
		
		* Save dataset and prepare for filling the black holes
		* (forward slashes work on every platform and match the other paths
		*  built from the data global in this file)
		compress
		save ${data}/employee_outflow_2, replace

		* we keep only spells which are preceded by a black hole
		keep if bhadd==1

		* we now replace these spells preceded by a black hole by the black hole itself

		* black hole spell has no information 
		* => delete info from what will be following spell
		* (Create list of vars on individual characteristics that remain valid in bh spell;
		*  with the option not, ds returns every variable that is NOT in this list)
		ds persnr frau nation bild schule wo_kreis wo_bula wo_aa ao_bula ao_kreis geb_dat potret agepotret dpotret person103 mid103 agepotret103 bhbegepi bhendepi, not

		* remove values from all vars except individual characteristics
		* NOTE(review): assigning numeric missing fails on string variables;
		* this presumes every variable outside the list above is numeric.
		foreach v of var `r(varlist)' { 
			replace `v'=.
		} 
		* status now black hole =10
		replace status=10
		label define status 10 "black hole", modify
		replace statsimple=.
		replace grund=10
		label define grund 10 "black hole", modify
		
		/* This replaces characteristics of black hole spells to missing
		=> Characteristics of black hole spell are otherwise not correct
		=> Their realization would correspond to those of the spell following the black hole.
		- One example: tentgelt would have been the tentgelt of the following period although we have
		no information of tentgelt in the black hole spell. Summing over tentgelt would therefore
		lead to incorrect results. */

		* The helper spell now covers exactly the black hole period
		replace begepi=bhbegepi
		replace endepi=bhendepi
		
		replace dur=(endepi-begepi)+1

		* Drop auxiliary variables	
		drop bhadd bhbegepi bhendepi black_hole_dur
	
		* Check the results 
		count
		disp(r(N))
		disp("Number of black holes interpreted as non-participation spells")

		* Save the filling helpfile	
		compress	
		save ${data}/employee_outflow_2_help, replace

	* Append filling helpfile to original dataset in order to fill black holes
	use ${data}/employee_outflow_2, clear
	append using ${data}/employee_outflow_2_help

	* append never lets value labels from the appended file replace labels that
	* are already in memory, so the entry for code 10 defined in the helpfile
	* would be lost here. Define it again on the combined data.
	label define status 10 "black hole", modify
	label define grund 10 "black hole", modify
	
	* drop firstpers - the value may no longer be valid
	* (we may have created a bh-spell prior to first one)
	* bys persnr begepi: g firstpers=(_n==1)
	drop firstpers
	bys persnr (begepi): g firstpers=1 if (_n==1)
	label var firstpers "first spell of that person in sample"
	drop lastpers
	bys persnr (begepi): g lastpers=1 if (_n==_N)
	label var lastpers "last spell of that person in sample"
	
	* Generate new spell variable
	cap drop spell
	gen spell=.
	bysort persnr (begepi): replace spell = _n	
	
	* we need no new identifier since we only have one series per person
	* clonevar keeps the storage type of persnr; a plain generate would store
	* the id as float, which cannot hold integers above 16,777,216 exactly.
	cap drop pid
	clonevar pid = persnr

	*** check after collating spells
	count if potret==.
	tab dpotret if endepi == potret & (!missing(potret))
	tab dpotret if mid103 == potret & mid103>=begepi & mid103<=endepi & (!missing(potret))
	tab thisspelllignite if (endepi == potret) & dpotret != 1 & potret !=.
	tab thisspelllignite if (mid103 == potret) & dpotret != 1 & potret !=.
	
	sav ${data}/postprep_2, replace  
	
	/* ------------------------------------------------------------------------ */
	* (12.c) Select only one series per person (no interruptions with long black holes) 
	*		(if $bhcritseries==1)
	*
	* 		Selection criterion (1) 
	*				We select only lignite series
	*
	* 		...If one individual has several series...
	*
	*		Selection criterion (2) - Option 1/2
	*				(Option 1/2) We use *longest series* 
	*
	* 				=> This will generate postprep_3.dta
	*				(other option 2/2 below)
	*
	*		REMARK: It has been suggested to keep the most recent series, 
	*				but this risks selecting lignite series that terminate at the end of the 
	*				observation period & that terminate with retirement.	  
	*		
	/* ------------------------------------------------------------------------ */
	
	** Load the sample saved at the end of (12.a): all individuals except those
	** with too many black holes. It is read from the same location it was
	** written to (the data global), not from a path relative to the working
	** directory.
	use ${data}/postprep_1.dta, clear

	** Generate a dummy that equals one if there is a black hole between this spell and the previous one
	di "Calculate the duration of each single black hole (if there is one)"
		bys persnr (begepi): gen black_hole_dur = (begepi - endepi[_n-1]) - 1
	di "Calculate a dummy that equals one if there is a black hole (even if it is only one day!)"
		gen bhdummy = 0
		replace bhdummy = 1 if black_hole_dur > 0 & black_hole_dur != .
	
	** Generate a variable that counts upwards at each black hole (i.e. it is "0" until the first black hole, then "1" until the next black hole and so on...)
	** Let us call "series" that collection of consecutive spells in which there is no black hole. 
	** The counter is the running number of black holes seen so far within a
	** person. The first spell of a person never carries a black hole (its gap
	** is missing), so every person starts at zero. A by-group running sum does
	** this in one pass and does not depend on the maximum number of black holes.
		di "For the first spell of each series, we set the counter to zero"
		bys persnr (begepi): gen bhcounter = sum(bhdummy)
		label var bhcounter "cumulative counter of series per person"

	** We now want to identify series of spells WITHOUT black holes in which we have lignite.
	** The global bhcritseries decides further below how many of these series per individual we keep.
	** (begepi) keeps the spells of each series in chronological order.
	bys persnr bhcounter (begepi): egen ligseries=max(somelignite)

	*(1) Select only series that contain lignite spells
	drop if ligseries==0

	* NOTE(review): this path is relative to the working directory, unlike the
	* read above. It is left unchanged because section (12.d) reads the file
	* back from exactly this path.
	save data/delprep_23.dta, replace

 if $bhcritseries==1 {

	* (2) Option (1/2): Select the longest series per person

	cap drop minbhcounter maxbhcounter

	* A person has more than one series left if the smallest and the largest
	* series counter differ
	bys persnr: egen maxbhcounter=max(bhcounter)
	bys persnr: egen minbhcounter=min(bhcounter)
	count if maxbhcounter!=minbhcounter

	* start and end points of series
	bys persnr bhcounter: egen lastinseries=max(spell)
	label var lastinseries "last spell in series"
	bys persnr bhcounter: egen firstinseries=min(spell)
	label var firstinseries "first spell in series"
	g Dseriestart=.
	g Dserieend=.
	replace Dseriestart=begepi if spell==firstinseries
	replace Dserieend=endepi if spell==lastinseries

	* copy start and end dates to all spells in series
	* (only one spell per series carries a value, so the mean is that value)
	bys persnr bhcounter: egen seriestart=mean(Dseriestart)
	bys persnr bhcounter: egen serieend=mean(Dserieend)

	* get duration of series
	g seriedur=.
	replace seriedur=serieend-seriestart

	* get longest-lasting series
	bys persnr: egen seriedurmax=max(seriedur)

	* keep only the longest series
	* NOTE(review): series that tie for the maximum duration are all kept, so
	* a person can still have more than one series after this step.
	g Dseriedurmax=.
	replace Dseriedurmax=1 if seriedurmax==seriedur 
	replace Dseriedurmax=0 if seriedur<seriedurmax
	drop if (Dseriedurmax==0 & minbhcounter!=maxbhcounter)
	count

	* we need no new identifier since we only have one series per person
	* in case we set bhcritseries=1, we simply copy persnr to pid
	* clonevar keeps the storage type of persnr; a plain generate would store
	* the id as float, which cannot hold integers above 16,777,216 exactly.
	cap drop pid
	clonevar pid = persnr

	sav data/postprep_3.dta, replace
}
 
	/* ------------------------------------------------------------------------ */
	* (12.d) Select only one series per person to use and drop others 
	*		(if $bhcritseries==1)
	*		Option (2/2) 
	*		- if only one series, use that one.
	*		- if several series, use *series with at least one transition out of lignite*
	*		- if several series with transitions out of lignite, use longest one
	* 		This generates sample postprep_4.dta
	/* ------------------------------------------------------------------------ */
	* load data generated after first preparations in 11c

	use data/delprep_23.dta, clear

	* (12d.1) If a person has several series, keep the series with the maximum
	*         number of transitions *out* of lignite.
	*         If no series has such a transition, all series tie at zero and
	*         the longest one is selected in step (3) below.

	* Flag the spell that directly follows a lignite spell within the same series.
	* The lag is only meaningful if the spells of a series are in chronological
	* order, hence (begepi): a by-prefix on persnr bhcounter alone does not
	* guarantee any particular order within a series.
	cap drop posttransout
	g posttransout = . 
	bys persnr bhcounter (begepi): replace posttransout=1 if thisspelllignite!=1 & thisspelllignite[_n-1]==1
	label var posttransout "dummy for period after transition out of lignite"
	cap drop posttransin

	cap drop transoutseries
	by persnr bhcounter: egen transoutseries = total(posttransout) 
	tab transoutseries
	label var transoutseries "number of transitions per series out of lignite"

	* We need to recount the number of spells & series since the non-lignite
	* series were dropped before delprep_23 was saved.
		cap drop spell
		bysort persnr (begepi): gen spell = _n
		* Recount series again
		* Running number of black holes per person. The first remaining spell
		* of a person always starts series 0, even if it carries a black hole
		* flag from before earlier series were removed.
		cap drop bhcounter
		di "For the first spell of each series, we set the counter to zero"
		bysort persnr (begepi): gen bhcounter = sum(cond(_n == 1, 0, bhdummy))
		label var bhcounter "cumulative counter of series per person"
		cap drop minbhcounter maxbhcounter
		bys persnr: egen maxbhcounter=max(bhcounter)
		bys persnr: egen minbhcounter=min(bhcounter)
		count if maxbhcounter!=minbhcounter

		* is there a series which has a transition in it? 
		* If all series have no transition, keep longest simply (below).
		* ( Avoid dropping all series if all have no transition )
		by persnr: egen maxtransoutseries = max(transoutseries)

		* Among persons with several series, drop every series that has fewer
		* transitions than the person's maximum
		drop if ((minbhcounter!=maxbhcounter)&(transoutseries!=maxtransoutseries))
		count 

	* (3) If (still) several series per person (i.e. if several series with equal - maximum - number of transitions), keep longest series

		* recount spells and series after the drop above
			cap drop spell
			bysort persnr (begepi): gen spell = _n
			cap drop bhcounter
			di "For the first spell of each series, we set the counter to zero"
			bysort persnr (begepi): gen bhcounter = sum(cond(_n == 1, 0, bhdummy))
			label var bhcounter "cumulative counter of series per person"
			cap drop minbhcounter maxbhcounter
			bys persnr: egen maxbhcounter=max(bhcounter)
			bys persnr: egen minbhcounter=min(bhcounter)
			count if maxbhcounter!=minbhcounter

		* start and end points of series
			bys persnr bhcounter: egen lastinseries=max(spell)
			label var lastinseries "last spell in series"
			bys persnr bhcounter: egen firstinseries=min(spell)
			label var firstinseries "first spell in series"
			g Dseriestart=.
			g Dserieend=.
			replace Dseriestart=begepi if spell==firstinseries
			replace Dserieend=endepi if spell==lastinseries

		* get duration of series
			* copy start and end dates to all spells in series
			* (only one spell per series carries a value, so the mean is that value)
			bys persnr bhcounter: egen seriestart=mean(Dseriestart)
			bys persnr bhcounter: egen serieend=mean(Dserieend)
			g seriedur=.
			replace seriedur=serieend-seriestart
			* get longest-lasting series
			bys persnr: egen seriedurmax=max(seriedur)
			g Dseriedurmax=.
			replace Dseriedurmax=1 if (seriedurmax==seriedur)
			replace Dseriedurmax=0 if (seriedur<seriedurmax & seriedurmax!=. & seriedur!=.)

			* NOTE(review): series that tie for the maximum duration are all
			* kept, so a person can still have more than one series afterwards.
			drop if (Dseriedurmax==0) & (minbhcounter!=maxbhcounter)
			count
			* we need no new identifier since we only have one series per person
			* in case we set bhcritseries=1, we simply copy persnr to pid
			* clonevar keeps the storage type of persnr; a plain generate would
			* store the id as float, which cannot hold integers above
			* 16,777,216 exactly.
			cap drop pid
			clonevar pid = persnr

			sav data/postprep_4.dta, replace	
			
** CLOSE LOG-FILE **	
* Mirrors the start-of-file block: the log is only closed when logging was
* switched on through the global log_active. The opening brace must be the
* last token on its line, so the stray dollar sign after it was removed.
if ${log_active}==1 {
	capture log close
	}
